diff --git a/code/jasmin/mlkem_avx2/extraction/jkem_avx2.ec b/code/jasmin/mlkem_avx2/extraction/jkem_avx2.ec index 67152bbf..56326c2d 100644 --- a/code/jasmin/mlkem_avx2/extraction/jkem_avx2.ec +++ b/code/jasmin/mlkem_avx2/extraction/jkem_avx2.ec @@ -9,8 +9,8 @@ Array1 Array2 Array4 Array5 Array6 Array7 Array8 Array16 Array24 Array25 Array32 Array64 Array128 Array256 Array400 Array536 Array768 Array960 Array1024 Array1088 Array2048 Array2144 Array2304 WArray1 WArray2 WArray4 WArray8 WArray16 WArray32 WArray64 WArray128 WArray160 WArray192 WArray200 -WArray224 WArray256 WArray512 WArray536 WArray768 WArray800 WArray960 -WArray1088 WArray1536 WArray2048 WArray2144 WArray4608. +WArray224 WArray256 WArray512 WArray536 WArray800 WArray960 WArray1088 +WArray1536 WArray2048 WArray2144 WArray4608. abbrev gen_matrix_indexes = (Array16.of_list witness @@ -635,22 +635,17 @@ abbrev pc_mask_s = (W16.of_int 15). abbrev pc_shift1_s = (W16.of_int 512). -abbrev shake_sep = -(Array4.of_list witness -[(W64.of_int (-9223372036854775808)); (W64.of_int (-9223372036854775808)); -(W64.of_int (-9223372036854775808)); (W64.of_int (-9223372036854775808))]). - -abbrev nEW_KECCAK__ROL8 = +abbrev rOL8 = (W256.of_int 13620818001941277694121380808605999856886653716761013959207994299728839901191 ). -abbrev nEW_KECCAK__ROL56 = +abbrev rOL56 = (W256.of_int 10910488462195273559651782724632284871561478246514020268633800075540923875841 ). -abbrev nEW_KECCAK__KECCAK_RHOTATES_RIGHT = +abbrev kECCAK_RHOTATES_RIGHT = (Array6.of_list witness [(W256.of_int 144373339913893657577751063007562604548177214458152943091773); (W256.of_int 232252764209307188274174373867837442080505530800860351692863); @@ -659,7 +654,7 @@ abbrev nEW_KECCAK__KECCAK_RHOTATES_RIGHT = (W256.of_int 276192476357013953622045746931053922384479139705868246843454); (W256.of_int 313855086769334038206421612937983674734430261968315659321364)]). -abbrev nEW_KECCAK__KECCAK_RHOTATES_LEFT = +abbrev kECCAK_RHOTATES_LEFT = (Array6.of_list witness [(W256.of_int 257361171150853911329517531560668107745210100483895842570243); (W256.of_int 169481746855440380633094220700393270212881784141188433969153); @@ -668,94 +663,7 @@ abbrev nEW_KECCAK__KECCAK_RHOTATES_LEFT = (W256.of_int 125542034707733615285222847637176789908908175236180538818562); (W256.of_int 87879424295413530700846981630247037558957052973733126340652)]). -abbrev nEW_KECCAK__KECCAK1600_RC = -(Array24.of_list witness -[(W64.of_int 1); (W64.of_int 32898); (W64.of_int (-9223372036854742902)); -(W64.of_int (-9223372034707259392)); (W64.of_int 32907); -(W64.of_int 2147483649); (W64.of_int (-9223372034707259263)); -(W64.of_int (-9223372036854743031)); (W64.of_int 138); (W64.of_int 136); -(W64.of_int 2147516425); (W64.of_int 2147483658); (W64.of_int 2147516555); -(W64.of_int (-9223372036854775669)); (W64.of_int (-9223372036854742903)); -(W64.of_int (-9223372036854743037)); (W64.of_int (-9223372036854743038)); -(W64.of_int (-9223372036854775680)); (W64.of_int 32778); -(W64.of_int (-9223372034707292150)); (W64.of_int (-9223372034707259263)); -(W64.of_int (-9223372036854742912)); (W64.of_int 2147483649); -(W64.of_int (-9223372034707259384))]). - -abbrev oLD_KECCAK__KeccakF1600RoundConstants = -(Array24.of_list witness -[(W256.of_int 6277101735386680764176071790128604879584176795969512275969); -(W256.of_int 206504092890751023779864409751650843328560248233805014854828162); -(W256.of_int -(-57896044618657891154337237002533387566728630465883811983015055433200855646070) -); -(W256.of_int -(-57896044605177918687001956587831074660851270707671256656745893357814858874880) -); -(W256.of_int 206560586806369503906741994397762000772476505824968740465311883); -(W256.of_int -13479973339852421633450939126351338586088633588469736715148203130881); -(W256.of_int -(-57896044605177917877255832722949256082138009781081227190387086677747775274879) -); -(W256.of_int -(-57896044618657891964083360867415206145441891392473841449373862113267939246071) -); -(W256.of_int 866240039483361945456297907037747473382616397843792694083722); -(W256.of_int 853685836012588583927945763457490263623448044251853669531784); -(W256.of_int -13480179078138900667299665761280331841242166839448401411882560290825); -(W256.of_int -13479973396346337251931066003935984697246077504727327878873813614602); -(W256.of_int -13480179894162126267568165104169664557960801185391384887919156166795); -(W256.of_int -(-57896044618658096836129800417901987324072977609879901317736128966209602322293) -); -(W256.of_int -(-57896044618657891160614338737920068330904702256012416862599232229170367922039) -); -(W256.of_int -(-57896044618657892001745971279735290730498322133245470726878922889085012901885) -); -(W256.of_int -(-57896044618657892008023073015121971494674393923374075606463099685054525177854) -); -(W256.of_int -(-57896044618658096905177919507155475730009767301294554993162073721874237357952) -); -(W256.of_int 205750840682504622088163281136835410743010147018288673381711882); -(W256.of_int -(-57896044605178124312300604384719547540610971740509902075209375727097995067382) -); -(W256.of_int -(-57896044605177917877255832722949256082138009781081227190387086677747775274879) -); -(W256.of_int -(-57896044618657891217108254356400195208489348367169860778856823392895978405760) -); -(W256.of_int -13479973339852421633450939126351338586088633588469736715148203130881); -(W256.of_int -(-57896044605177918636785142704737628547442696386642417620072478990058760667128) -)]). - -abbrev oLD_KECCAK__rho8 = -(W256.of_int -13620818001941277694121380808605999856886653716761013959207994299728839901191 -). - -abbrev oLD_KECCAK__rho56 = -(W256.of_int -10910488462195273559651782724632284871561478246514020268633800075540923875841 -). - -abbrev oLD_KECCAK__shake_sep = -(Array4.of_list witness -[(W64.of_int (-9223372036854775808)); (W64.of_int (-9223372036854775808)); -(W64.of_int (-9223372036854775808)); (W64.of_int (-9223372036854775808))]). - -abbrev oLD_KECCAK__KECCAK1600_RC = +abbrev kECCAK1600_RC = (Array24.of_list witness [(W64.of_int 1); (W64.of_int 32898); (W64.of_int (-9223372036854742902)); (W64.of_int (-9223372034707259392)); (W64.of_int 32907); @@ -1392,12 +1300,12 @@ module M(SC:Syscall_t) = { rd <- (VPSUB_16u16 rhi rlo); return rd; } - proc nEW_KECCAK__keccakf1600_index (x:int, y:int) : int = { + proc keccakf1600_index (x:int, y:int) : int = { var r:int; r <- ((x %% 5) + (5 * (y %% 5))); return r; } - proc nEW_KECCAK__keccakf1600_rho_offsets (i:int) : int = { + proc keccakf1600_rho_offsets (i:int) : int = { var aux:int; var r:int; var x:int; @@ -1421,15 +1329,14 @@ module M(SC:Syscall_t) = { } return r; } - proc nEW_KECCAK__keccakf1600_rhotates (x:int, y:int) : int = { + proc keccakf1600_rhotates (x:int, y:int) : int = { var r:int; var i:int; - i <@ nEW_KECCAK__keccakf1600_index (x, y); - r <@ nEW_KECCAK__keccakf1600_rho_offsets (i); + i <@ keccakf1600_index (x, y); + r <@ keccakf1600_rho_offsets (i); return r; } - proc nEW_KECCAK____keccakf1600_pround_avx2 (state:W256.t Array7.t) : - W256.t Array7.t = { + proc __keccakf1600_pround_avx2 (state:W256.t Array7.t) : W256.t Array7.t = { var c00:W256.t; var c14:W256.t; var t2:W256.t; @@ -1520,24 +1427,20 @@ module M(SC:Syscall_t) = { ((0 %% (2 ^ 1)) + ((2 ^ 1) * ((0 %% (2 ^ 1)) + ((2 ^ 1) * 0)))))))))))))) )); d14 <- (d14 `^` t4); - t3 <- (VPSLLV_4u64 state.[2] nEW_KECCAK__KECCAK_RHOTATES_LEFT.[0]); - state.[2] <- - (VPSRLV_4u64 state.[2] nEW_KECCAK__KECCAK_RHOTATES_RIGHT.[0]); + t3 <- (VPSLLV_4u64 state.[2] kECCAK_RHOTATES_LEFT.[0]); + state.[2] <- (VPSRLV_4u64 state.[2] kECCAK_RHOTATES_RIGHT.[0]); state.[2] <- (state.[2] `|` t3); state.[3] <- (state.[3] `^` d14); - t4 <- (VPSLLV_4u64 state.[3] nEW_KECCAK__KECCAK_RHOTATES_LEFT.[2]); - state.[3] <- - (VPSRLV_4u64 state.[3] nEW_KECCAK__KECCAK_RHOTATES_RIGHT.[2]); + t4 <- (VPSLLV_4u64 state.[3] kECCAK_RHOTATES_LEFT.[2]); + state.[3] <- (VPSRLV_4u64 state.[3] kECCAK_RHOTATES_RIGHT.[2]); state.[3] <- (state.[3] `|` t4); state.[4] <- (state.[4] `^` d14); - t5 <- (VPSLLV_4u64 state.[4] nEW_KECCAK__KECCAK_RHOTATES_LEFT.[3]); - state.[4] <- - (VPSRLV_4u64 state.[4] nEW_KECCAK__KECCAK_RHOTATES_RIGHT.[3]); + t5 <- (VPSLLV_4u64 state.[4] kECCAK_RHOTATES_LEFT.[3]); + state.[4] <- (VPSRLV_4u64 state.[4] kECCAK_RHOTATES_RIGHT.[3]); state.[4] <- (state.[4] `|` t5); state.[5] <- (state.[5] `^` d14); - t6 <- (VPSLLV_4u64 state.[5] nEW_KECCAK__KECCAK_RHOTATES_LEFT.[4]); - state.[5] <- - (VPSRLV_4u64 state.[5] nEW_KECCAK__KECCAK_RHOTATES_RIGHT.[4]); + t6 <- (VPSLLV_4u64 state.[5] kECCAK_RHOTATES_LEFT.[4]); + state.[5] <- (VPSRLV_4u64 state.[5] kECCAK_RHOTATES_RIGHT.[4]); state.[5] <- (state.[5] `|` t6); state.[6] <- (state.[6] `^` d14); t3 <- @@ -1552,8 +1455,8 @@ module M(SC:Syscall_t) = { ((1 %% (2 ^ 2)) + ((2 ^ 2) * ((3 %% (2 ^ 2)) + ((2 ^ 2) * ((0 %% (2 ^ 2)) + ((2 ^ 2) * 2)))))))); - t7 <- (VPSLLV_4u64 state.[6] nEW_KECCAK__KECCAK_RHOTATES_LEFT.[5]); - t1 <- (VPSRLV_4u64 state.[6] nEW_KECCAK__KECCAK_RHOTATES_RIGHT.[5]); + t7 <- (VPSLLV_4u64 state.[6] kECCAK_RHOTATES_LEFT.[5]); + t1 <- (VPSRLV_4u64 state.[6] kECCAK_RHOTATES_RIGHT.[5]); t1 <- (t1 `|` t7); state.[1] <- (state.[1] `^` d14); t5 <- @@ -1568,8 +1471,8 @@ module M(SC:Syscall_t) = { ((2 %% (2 ^ 2)) + ((2 ^ 2) * ((0 %% (2 ^ 2)) + ((2 ^ 2) * ((3 %% (2 ^ 2)) + ((2 ^ 2) * 1)))))))); - t8 <- (VPSLLV_4u64 state.[1] nEW_KECCAK__KECCAK_RHOTATES_LEFT.[1]); - t2 <- (VPSRLV_4u64 state.[1] nEW_KECCAK__KECCAK_RHOTATES_RIGHT.[1]); + t8 <- (VPSLLV_4u64 state.[1] kECCAK_RHOTATES_LEFT.[1]); + t2 <- (VPSRLV_4u64 state.[1] kECCAK_RHOTATES_RIGHT.[1]); t2 <- (t2 `|` t8); t7 <- (VPSRLDQ_256 t1 (W8.of_int 8)); t0 <- ((invw t1) `&` t7); @@ -2104,29 +2007,27 @@ module M(SC:Syscall_t) = { state.[4] <- (state.[4] `^` t4); return state; } - proc nEW_KECCAK___keccakf1600_avx2 (state:W256.t Array7.t) : W256.t Array7.t = { + proc _keccakf1600_avx2 (state:W256.t Array7.t) : W256.t Array7.t = { var round_constants:W64.t Array24.t; var r:W64.t; var rc:W256.t; round_constants <- witness; - round_constants <- nEW_KECCAK__KECCAK1600_RC; + round_constants <- kECCAK1600_RC; r <- (W64.of_int 0); - state <@ nEW_KECCAK____keccakf1600_pround_avx2 (state); + state <@ __keccakf1600_pround_avx2 (state); rc <- (VPBROADCAST_4u64 round_constants.[(W64.to_uint r)]); state.[0] <- (state.[0] `^` rc); r <- (r + (W64.of_int 1)); while ((r \ult (W64.of_int 24))) { - state <@ nEW_KECCAK____keccakf1600_pround_avx2 (state); + state <@ __keccakf1600_pround_avx2 (state); rc <- (VPBROADCAST_4u64 round_constants.[(W64.to_uint r)]); state.[0] <- (state.[0] `^` rc); r <- (r + (W64.of_int 1)); } return state; } - proc nEW_KECCAK____mread_subu64 (buf:W64.t, lEN:int, tRAIL:int) : W64.t * - int * - int * - W64.t = { + proc __mread_subu64 (buf:W64.t, lEN:int, tRAIL:int) : W64.t * int * int * + W64.t = { var w:W64.t; var iLEN:int; var t16:W64.t; @@ -2186,8 +2087,8 @@ module M(SC:Syscall_t) = { } return (buf, lEN, tRAIL, w); } - proc nEW_KECCAK____mread_bcast_4subu64 (buf:W64.t, lEN:int, tRAIL:int) : - W64.t * int * int * W256.t = { + proc __mread_bcast_4subu64 (buf:W64.t, lEN:int, tRAIL:int) : W64.t * int * + int * W256.t = { var w:W256.t; var t64:W64.t; var t128:W128.t; @@ -2201,16 +2102,15 @@ module M(SC:Syscall_t) = { buf <- (buf + (W64.of_int 8)); lEN <- (lEN - 8); } else { - (buf, lEN, tRAIL, t64) <@ nEW_KECCAK____mread_subu64 (buf, lEN, - tRAIL); + (buf, lEN, tRAIL, t64) <@ __mread_subu64 (buf, lEN, tRAIL); t128 <- (zeroextu128 t64); w <- (VPBROADCAST_4u64 (truncateu64 t128)); } } return (buf, lEN, tRAIL, w); } - proc nEW_KECCAK____mread_subu128 (buf:W64.t, lEN:int, tRAIL:int) : - W64.t * int * int * W128.t = { + proc __mread_subu128 (buf:W64.t, lEN:int, tRAIL:int) : W64.t * int * int * + W128.t = { var w:W128.t; var t64:W64.t; if (((lEN <= 0) /\ (tRAIL = 0))) { @@ -2226,20 +2126,18 @@ module M(SC:Syscall_t) = { (VMOV_64 (loadW64 Glob.mem (W64.to_uint (buf + (W64.of_int 0))))); buf <- (buf + (W64.of_int 8)); lEN <- (lEN - 8); - (buf, lEN, tRAIL, t64) <@ nEW_KECCAK____mread_subu64 (buf, - lEN, tRAIL); + (buf, lEN, tRAIL, t64) <@ __mread_subu64 (buf, lEN, tRAIL); w <- (VPINSR_2u64 w t64 (W8.of_int 1)); } else { - (buf, lEN, tRAIL, t64) <@ nEW_KECCAK____mread_subu64 (buf, - lEN, tRAIL); + (buf, lEN, tRAIL, t64) <@ __mread_subu64 (buf, lEN, tRAIL); w <- (zeroextu128 t64); } } } return (buf, lEN, tRAIL, w); } - proc nEW_KECCAK____mread_subu256 (buf:W64.t, lEN:int, tRAIL:int) : - W64.t * int * int * W256.t = { + proc __mread_subu256 (buf:W64.t, lEN:int, tRAIL:int) : W64.t * int * int * + W256.t = { var w:W256.t; var t128_1:W128.t; var t128_0:W128.t; @@ -2255,16 +2153,14 @@ module M(SC:Syscall_t) = { t128_0 <- (loadW128 Glob.mem (W64.to_uint (buf + (W64.of_int 0)))); buf <- (buf + (W64.of_int 16)); lEN <- (lEN - 16); - (buf, lEN, tRAIL, t128_1) <@ nEW_KECCAK____mread_subu128 (buf, - lEN, tRAIL); + (buf, lEN, tRAIL, t128_1) <@ __mread_subu128 (buf, lEN, tRAIL); w <- (W256.of_int (((W128.to_uint t128_0) %% (2 ^ 128)) + ((2 ^ 128) * (W128.to_uint t128_1)))); } else { t128_1 <- (set0_128); - (buf, lEN, tRAIL, t128_0) <@ nEW_KECCAK____mread_subu128 (buf, - lEN, tRAIL); + (buf, lEN, tRAIL, t128_0) <@ __mread_subu128 (buf, lEN, tRAIL); w <- (W256.of_int (((W128.to_uint t128_0) %% (2 ^ 128)) + @@ -2274,8 +2170,7 @@ module M(SC:Syscall_t) = { } return (buf, lEN, tRAIL, w); } - proc nEW_KECCAK____mwrite_subu64 (buf:W64.t, lEN:int, w:W64.t) : W64.t * - int = { + proc __mwrite_subu64 (buf:W64.t, lEN:int, w:W64.t) : W64.t * int = { if ((0 < lEN)) { if ((8 <= lEN)) { @@ -2319,8 +2214,7 @@ module M(SC:Syscall_t) = { } return (buf, lEN); } - proc nEW_KECCAK____mwrite_subu128 (buf:W64.t, lEN:int, w:W128.t) : - W64.t * int = { + proc __mwrite_subu128 (buf:W64.t, lEN:int, w:W128.t) : W64.t * int = { var t64:W64.t; if ((0 < lEN)) { if ((16 <= lEN)) { @@ -2340,15 +2234,14 @@ module M(SC:Syscall_t) = { } t64 <- (truncateu64 w); - (buf, lEN) <@ nEW_KECCAK____mwrite_subu64 (buf, lEN, t64); + (buf, lEN) <@ __mwrite_subu64 (buf, lEN, t64); } } else { } return (buf, lEN); } - proc nEW_KECCAK____mwrite_subu256 (buf:W64.t, lEN:int, w:W256.t) : - W64.t * int = { + proc __mwrite_subu256 (buf:W64.t, lEN:int, w:W256.t) : W64.t * int = { var t128:W128.t; if ((0 < lEN)) { if ((32 <= lEN)) { @@ -2367,14 +2260,14 @@ module M(SC:Syscall_t) = { } else { t128 <- (truncateu128 w); } - (buf, lEN) <@ nEW_KECCAK____mwrite_subu128 (buf, lEN, t128); + (buf, lEN) <@ __mwrite_subu128 (buf, lEN, t128); } } else { } return (buf, lEN); } - proc nEW_KECCAK____state_init_avx2 () : W256.t Array7.t = { + proc __state_init_avx2 () : W256.t Array7.t = { var aux:int; var st:W256.t Array7.t; var i:int; @@ -2386,7 +2279,7 @@ module M(SC:Syscall_t) = { } return st; } - proc nEW_KECCAK____pstate_init_avx2 (pst:W64.t Array25.t) : W64.t Array25.t = { + proc __pstate_init_avx2 (pst:W64.t Array25.t) : W64.t Array25.t = { var aux:int; var z256:W256.t; var i:int; @@ -2405,9 +2298,8 @@ module M(SC:Syscall_t) = { pst.[24] <- z64; return pst; } - proc nEW_KECCAK____perm_reg3456_avx2 (r3:W256.t, r4:W256.t, r5:W256.t, - r6:W256.t) : W256.t * W256.t * - W256.t * W256.t = { + proc __perm_reg3456_avx2 (r3:W256.t, r4:W256.t, r5:W256.t, r6:W256.t) : + W256.t * W256.t * W256.t * W256.t = { var st3:W256.t; var st4:W256.t; var st5:W256.t; @@ -2537,7 +2429,7 @@ module M(SC:Syscall_t) = { )); return (st3, st4, st5, st6); } - proc nEW_KECCAK____state_from_pstate_avx2 (pst:W64.t Array25.t) : W256.t Array7.t = { + proc __state_from_pstate_avx2 (pst:W64.t Array25.t) : W256.t Array7.t = { var aux_2:W256.t; var aux_1:W256.t; var aux_0:W256.t; @@ -2566,27 +2458,25 @@ module M(SC:Syscall_t) = { (((W128.to_uint t128_1) %% (2 ^ 128)) + ((2 ^ 128) * (W128.to_uint t128_0)))); st.[6] <- (get256_direct (WArray200.init64 (fun i => pst.[i])) (21 * 8)); - (aux_2, aux_1, aux_0, aux) <@ nEW_KECCAK____perm_reg3456_avx2 (st.[3], - st.[4], st.[5], st.[6]); + (aux_2, aux_1, aux_0, aux) <@ __perm_reg3456_avx2 (st.[3], st.[4], + st.[5], st.[6]); st.[3] <- aux_2; st.[4] <- aux_1; st.[5] <- aux_0; st.[6] <- aux; return st; } - proc nEW_KECCAK____addstate_r3456 (st:W256.t Array7.t, r3:W256.t, - r4:W256.t, r5:W256.t, r6:W256.t) : - W256.t Array7.t = { + proc __addstate_r3456 (st:W256.t Array7.t, r3:W256.t, r4:W256.t, r5:W256.t, + r6:W256.t) : W256.t Array7.t = { - (r3, r4, r5, r6) <@ nEW_KECCAK____perm_reg3456_avx2 (r3, r4, r5, r6); + (r3, r4, r5, r6) <@ __perm_reg3456_avx2 (r3, r4, r5, r6); st.[3] <- (st.[3] `^` r3); st.[4] <- (st.[4] `^` r4); st.[5] <- (st.[5] `^` r5); st.[6] <- (st.[6] `^` r6); return st; } - proc nEW_KECCAK____addpst01 (st:W256.t Array7.t, pst:W64.t Array25.t) : - W256.t Array7.t = { + proc __addpst01 (st:W256.t Array7.t, pst:W64.t Array25.t) : W256.t Array7.t = { var t256:W256.t; t256 <- (VPBROADCAST_4u64 (get64_direct (WArray200.init64 (fun i => pst.[i])) 0)); @@ -2595,8 +2485,7 @@ module M(SC:Syscall_t) = { st.[1] <- (st.[1] `^` t256); return st; } - proc nEW_KECCAK____addpst23456 (st:W256.t Array7.t, pst:W64.t Array25.t) : - W256.t Array7.t = { + proc __addpst23456 (st:W256.t Array7.t, pst:W64.t Array25.t) : W256.t Array7.t = { var t128_0:W128.t; var r3:W256.t; var t128_1:W128.t; @@ -2622,17 +2511,16 @@ module M(SC:Syscall_t) = { ((2 ^ 128) * (W128.to_uint t128_0)))); st.[2] <- (st.[2] `^` r2); r6 <- (get256_direct (WArray200.init64 (fun i => pst.[i])) (21 * 8)); - st <@ nEW_KECCAK____addstate_r3456 (st, r3, r4, r5, r6); + st <@ __addstate_r3456 (st, r3, r4, r5, r6); return st; } - proc nEW_KECCAK___addpstate_avx2 (st:W256.t Array7.t, pst:W64.t Array25.t) : - W256.t Array7.t = { + proc _addpstate_avx2 (st:W256.t Array7.t, pst:W64.t Array25.t) : W256.t Array7.t = { - st <@ nEW_KECCAK____addpst01 (st, pst); - st <@ nEW_KECCAK____addpst23456 (st, pst); + st <@ __addpst01 (st, pst); + st <@ __addpst23456 (st, pst); return st; } - proc nEW_KECCAK____stavx2_pos (pOS:int) : int * int = { + proc __stavx2_pos (pOS:int) : int * int = { var r:int; var l:int; r <- 0; @@ -2749,7 +2637,7 @@ module M(SC:Syscall_t) = { } return (r, l); } - proc nEW_KECCAK____u64_to_u256 (x:W64.t, l:int) : W256.t = { + proc __u64_to_u256 (x:W64.t, l:int) : W256.t = { var t256:W256.t; var t128:W128.t; if (((l %% 2) = 0)) { @@ -2766,26 +2654,24 @@ module M(SC:Syscall_t) = { } return t256; } - proc nEW_KECCAK____addratebit_avx2 (st:W256.t Array7.t, rATE8:int) : - W256.t Array7.t = { + proc __addratebit_avx2 (st:W256.t Array7.t, rATE8:int) : W256.t Array7.t = { var t64:W64.t; var r:int; var l:int; var t256:W256.t; t64 <- (W64.of_int 1); t64 <- (t64 `<<` (W8.of_int (((8 * rATE8) - 1) %% 64))); - (r, l) <@ nEW_KECCAK____stavx2_pos (((rATE8 - 1) %/ 8)); + (r, l) <@ __stavx2_pos (((rATE8 - 1) %/ 8)); if ((r = 0)) { t256 <- (VPBROADCAST_4u64 t64); } else { - t256 <@ nEW_KECCAK____u64_to_u256 (t64, l); + t256 <@ __u64_to_u256 (t64, l); } st.[r] <- (st.[r] `^` t256); return st; } - proc nEW_KECCAK____addstate_imem_avx2 (st:W256.t Array7.t, buf:W64.t, - lEN:int, tRAILB:int) : W256.t Array7.t * - W64.t = { + proc __addstate_imem_avx2 (st:W256.t Array7.t, buf:W64.t, lEN:int, + tRAILB:int) : W256.t Array7.t * W64.t = { var r0:W256.t; var r1:W256.t; var t64:W64.t; @@ -2796,46 +2682,36 @@ module M(SC:Syscall_t) = { var r5:W256.t; var r2:W256.t; var r6:W256.t; - (buf, lEN, tRAILB, r0) <@ nEW_KECCAK____mread_bcast_4subu64 (buf, - lEN, tRAILB); + (buf, lEN, tRAILB, r0) <@ __mread_bcast_4subu64 (buf, lEN, tRAILB); st.[0] <- (st.[0] `^` r0); - (buf, lEN, tRAILB, r1) <@ nEW_KECCAK____mread_subu256 (buf, lEN, tRAILB); + (buf, lEN, tRAILB, r1) <@ __mread_subu256 (buf, lEN, tRAILB); st.[1] <- (st.[1] `^` r1); if ((0 < lEN)) { - (buf, lEN, tRAILB, t64) <@ nEW_KECCAK____mread_subu64 (buf, lEN, - tRAILB); + (buf, lEN, tRAILB, t64) <@ __mread_subu64 (buf, lEN, tRAILB); t128_1 <- (zeroextu128 t64); - (buf, lEN, tRAILB, r3) <@ nEW_KECCAK____mread_subu256 (buf, lEN, - tRAILB); - (buf, lEN, tRAILB, t64) <@ nEW_KECCAK____mread_subu64 (buf, lEN, - tRAILB); + (buf, lEN, tRAILB, r3) <@ __mread_subu256 (buf, lEN, tRAILB); + (buf, lEN, tRAILB, t64) <@ __mread_subu64 (buf, lEN, tRAILB); t128_0 <- (zeroextu128 t64); - (buf, lEN, tRAILB, r4) <@ nEW_KECCAK____mread_subu256 (buf, lEN, - tRAILB); - (buf, lEN, tRAILB, t64) <@ nEW_KECCAK____mread_subu64 (buf, lEN, - tRAILB); + (buf, lEN, tRAILB, r4) <@ __mread_subu256 (buf, lEN, tRAILB); + (buf, lEN, tRAILB, t64) <@ __mread_subu64 (buf, lEN, tRAILB); t128_1 <- (VPINSR_2u64 t128_1 t64 (W8.of_int 1)); - (buf, lEN, tRAILB, r5) <@ nEW_KECCAK____mread_subu256 (buf, lEN, - tRAILB); - (buf, lEN, tRAILB, t64) <@ nEW_KECCAK____mread_subu64 (buf, lEN, - tRAILB); + (buf, lEN, tRAILB, r5) <@ __mread_subu256 (buf, lEN, tRAILB); + (buf, lEN, tRAILB, t64) <@ __mread_subu64 (buf, lEN, tRAILB); t128_0 <- (VPINSR_2u64 t128_0 t64 (W8.of_int 1)); r2 <- (W256.of_int (((W128.to_uint t128_0) %% (2 ^ 128)) + ((2 ^ 128) * (W128.to_uint t128_1)))); st.[2] <- (st.[2] `^` r2); - (buf, lEN, tRAILB, r6) <@ nEW_KECCAK____mread_subu256 (buf, lEN, - tRAILB); - st <@ nEW_KECCAK____addstate_r3456 (st, r3, r4, r5, r6); + (buf, lEN, tRAILB, r6) <@ __mread_subu256 (buf, lEN, tRAILB); + st <@ __addstate_r3456 (st, r3, r4, r5, r6); } else { } return (st, buf); } - proc nEW_KECCAK____absorb_imem_avx2 (st:W256.t Array7.t, buf:W64.t, - lEN:int, rATE8:int, tRAILB:int) : - W256.t Array7.t * W64.t = { + proc __absorb_imem_avx2 (st:W256.t Array7.t, buf:W64.t, lEN:int, rATE8:int, + tRAILB:int) : W256.t Array7.t * W64.t = { var aLL:int; var iTERS:int; var i:W64.t; @@ -2844,25 +2720,24 @@ module M(SC:Syscall_t) = { if ((0 < iTERS)) { i <- (W64.of_int 0); while ((i \ult (W64.of_int iTERS))) { - (st, buf) <@ nEW_KECCAK____addstate_imem_avx2 (st, buf, rATE8, 0); - st <@ nEW_KECCAK___keccakf1600_avx2 (st); + (st, buf) <@ __addstate_imem_avx2 (st, buf, rATE8, 0); + st <@ _keccakf1600_avx2 (st); i <- (i + (W64.of_int 1)); } } else { } lEN <- (lEN %% rATE8); - (st, buf) <@ nEW_KECCAK____addstate_imem_avx2 (st, buf, lEN, tRAILB); + (st, buf) <@ __addstate_imem_avx2 (st, buf, lEN, tRAILB); if ((tRAILB <> 0)) { - st <@ nEW_KECCAK____addratebit_avx2 (st, rATE8); + st <@ __addratebit_avx2 (st, rATE8); } else { } return (st, buf); } - proc nEW_KECCAK____pstate_imem_avx2 (pst:W64.t Array25.t, aT:int, - buf:W64.t, lEN:int, tRAILB:int) : - W64.t Array25.t * int * W64.t = { + proc __pstate_imem_avx2 (pst:W64.t Array25.t, aT:int, buf:W64.t, lEN:int, + tRAILB:int) : W64.t Array25.t * int * W64.t = { var aLL:int; var lO:int; var at:W64.t; @@ -2883,8 +2758,7 @@ module M(SC:Syscall_t) = { } else { } - (buf, _2, tRAILB, t64) <@ nEW_KECCAK____mread_subu64 (buf, lEN, - tRAILB); + (buf, _2, tRAILB, t64) <@ __mread_subu64 (buf, lEN, tRAILB); t64 <- (t64 `<<` (W8.of_int (8 * lO))); pst.[(W64.to_uint at)] <- (pst.[(W64.to_uint at)] `^` t64); lO <- 0; @@ -2895,8 +2769,7 @@ module M(SC:Syscall_t) = { t64 <- (loadW64 Glob.mem (W64.to_uint (buf + (W64.of_int 0)))); buf <- (buf + (W64.of_int (8 - lO))); } else { - (buf, _0, _1, t64) <@ nEW_KECCAK____mread_subu64 (buf, (8 - lO), - 0); + (buf, _0, _1, t64) <@ __mread_subu64 (buf, (8 - lO), 0); } lEN <- (lEN - (8 - lO)); aT <- (aT + (8 - lO)); @@ -2955,17 +2828,15 @@ module M(SC:Syscall_t) = { } else { } - (buf, _3, tRAILB, t64) <@ nEW_KECCAK____mread_subu64 (buf, lO, - tRAILB); + (buf, _3, tRAILB, t64) <@ __mread_subu64 (buf, lO, tRAILB); pst.[(aLL %/ 8)] <- t64; } else { } return (pst, aLL, buf); } - proc nEW_KECCAK____pabsorb_imem_avx2 (pst:W64.t Array25.t, aT:int, - st:W256.t Array7.t, buf:W64.t, - lEN:int, rATE8:int, tRAILB:int) : + proc __pabsorb_imem_avx2 (pst:W64.t Array25.t, aT:int, st:W256.t Array7.t, + buf:W64.t, lEN:int, rATE8:int, tRAILB:int) : W64.t Array25.t * int * W256.t Array7.t * W64.t = { var aLL:int; var iTERS:int; @@ -2973,8 +2844,7 @@ module M(SC:Syscall_t) = { var _0:int; aLL <- (aT + lEN); if (((aT + lEN) < rATE8)) { - (pst, aT, buf) <@ nEW_KECCAK____pstate_imem_avx2 (pst, aT, buf, - lEN, tRAILB); + (pst, aT, buf) <@ __pstate_imem_avx2 (pst, aT, buf, lEN, tRAILB); if ((tRAILB <> 0)) { i <- (W64.of_int ((aT %/ 8) + 1)); if ((aT <= (5 * 8))) { @@ -2982,8 +2852,8 @@ module M(SC:Syscall_t) = { pst.[(W64.to_uint i)] <- (W64.of_int 0); i <- (i + (W64.of_int 1)); } - st <@ nEW_KECCAK____addpst01 (st, pst); - st <@ nEW_KECCAK____addratebit_avx2 (st, rATE8); + st <@ __addpst01 (st, pst); + st <@ __addratebit_avx2 (st, rATE8); } else { while ((i \ult (W64.of_int (rATE8 %/ 8)))) { pst.[(W64.to_uint i)] <- (W64.of_int 0); @@ -2996,18 +2866,18 @@ module M(SC:Syscall_t) = { (rATE8 - 1) ((get8 (WArray200.init64 (fun i_0 => pst.[i_0])) (rATE8 - 1)) `^` (W8.of_int 128))))); - st <@ nEW_KECCAK___addpstate_avx2 (st, pst); + st <@ _addpstate_avx2 (st, pst); } } else { } } else { if ((aT <> 0)) { - (pst, _0, buf) <@ nEW_KECCAK____pstate_imem_avx2 (pst, aT, buf, - (rATE8 - aT), 0); + (pst, _0, buf) <@ __pstate_imem_avx2 (pst, aT, buf, (rATE8 - aT), + 0); lEN <- (lEN - (rATE8 - aT)); - st <@ nEW_KECCAK___addpstate_avx2 (st, pst); - st <@ nEW_KECCAK___keccakf1600_avx2 (st); + st <@ _addpstate_avx2 (st, pst); + st <@ _keccakf1600_avx2 (st); aT <- 0; } else { @@ -3015,19 +2885,18 @@ module M(SC:Syscall_t) = { iTERS <- (lEN %/ rATE8); i <- (W64.of_int 0); while ((i \ult (W64.of_int iTERS))) { - (st, buf) <@ nEW_KECCAK____addstate_imem_avx2 (st, buf, rATE8, 0); - st <@ nEW_KECCAK___keccakf1600_avx2 (st); + (st, buf) <@ __addstate_imem_avx2 (st, buf, rATE8, 0); + st <@ _keccakf1600_avx2 (st); i <- (i + (W64.of_int 1)); } lEN <- (aLL %% rATE8); if ((tRAILB <> 0)) { - (st, buf) <@ nEW_KECCAK____addstate_imem_avx2 (st, buf, lEN, tRAILB); - st <@ nEW_KECCAK____addratebit_avx2 (st, rATE8); + (st, buf) <@ __addstate_imem_avx2 (st, buf, lEN, tRAILB); + st <@ __addratebit_avx2 (st, rATE8); aT <- 0; } else { if ((lEN <> 0)) { - (pst, aT, buf) <@ nEW_KECCAK____pstate_imem_avx2 (pst, 0, buf, - lEN, tRAILB); + (pst, aT, buf) <@ __pstate_imem_avx2 (pst, 0, buf, lEN, tRAILB); } else { } @@ -3035,8 +2904,7 @@ module M(SC:Syscall_t) = { } return (pst, aT, st, buf); } - proc nEW_KECCAK____dumpstate_imem_avx2 (buf:W64.t, lEN:int, - st:W256.t Array7.t) : W64.t = { + proc __dumpstate_imem_avx2 (buf:W64.t, lEN:int, st:W256.t Array7.t) : W64.t = { var t128_0:W128.t; var t128_1:W128.t; var t:W64.t; @@ -3047,17 +2915,17 @@ module M(SC:Syscall_t) = { var t256_4:W256.t; var _0:int; if ((8 <= lEN)) { - (buf, _0) <@ nEW_KECCAK____mwrite_subu256 (buf, 8, st.[0]); + (buf, _0) <@ __mwrite_subu256 (buf, 8, st.[0]); lEN <- (lEN - 8); } else { - (buf, lEN) <@ nEW_KECCAK____mwrite_subu256 (buf, lEN, st.[0]); + (buf, lEN) <@ __mwrite_subu256 (buf, lEN, st.[0]); } - (buf, lEN) <@ nEW_KECCAK____mwrite_subu256 (buf, lEN, st.[1]); + (buf, lEN) <@ __mwrite_subu256 (buf, lEN, st.[1]); if ((0 < lEN)) { t128_0 <- (truncateu128 st.[2]); t128_1 <- (VEXTRACTI128 st.[2] (W8.of_int 1)); t <- (truncateu64 t128_1); - (buf, lEN) <@ nEW_KECCAK____mwrite_subu64 (buf, lEN, t); + (buf, lEN) <@ __mwrite_subu64 (buf, lEN, t); t128_1 <- (VPUNPCKH_2u64 t128_1 t128_1); } else { @@ -3139,13 +3007,13 @@ module M(SC:Syscall_t) = { ((2 ^ 1) * ((0 %% (2 ^ 1)) + ((2 ^ 1) * ((1 %% (2 ^ 1)) + ((2 ^ 1) * 1)))))))))))))) )); - (buf, lEN) <@ nEW_KECCAK____mwrite_subu256 (buf, lEN, t256_4); + (buf, lEN) <@ __mwrite_subu256 (buf, lEN, t256_4); } else { } if ((0 < lEN)) { t <- (truncateu64 t128_0); - (buf, lEN) <@ nEW_KECCAK____mwrite_subu64 (buf, lEN, t); + (buf, lEN) <@ __mwrite_subu64 (buf, lEN, t); t128_0 <- (VPUNPCKH_2u64 t128_0 t128_0); } else { @@ -3166,13 +3034,13 @@ module M(SC:Syscall_t) = { ((2 ^ 1) * ((0 %% (2 ^ 1)) + ((2 ^ 1) * ((1 %% (2 ^ 1)) + ((2 ^ 1) * 1)))))))))))))) )); - (buf, lEN) <@ nEW_KECCAK____mwrite_subu256 (buf, lEN, t256_4); + (buf, lEN) <@ __mwrite_subu256 (buf, lEN, t256_4); } else { } if ((0 < lEN)) { t <- (truncateu64 t128_1); - (buf, lEN) <@ nEW_KECCAK____mwrite_subu64 (buf, lEN, t); + (buf, lEN) <@ __mwrite_subu64 (buf, lEN, t); } else { } @@ -3192,13 +3060,13 @@ module M(SC:Syscall_t) = { ((2 ^ 1) * ((0 %% (2 ^ 1)) + ((2 ^ 1) * ((1 %% (2 ^ 1)) + ((2 ^ 1) * 1)))))))))))))) )); - (buf, lEN) <@ nEW_KECCAK____mwrite_subu256 (buf, lEN, t256_4); + (buf, lEN) <@ __mwrite_subu256 (buf, lEN, t256_4); } else { } if ((0 < lEN)) { t <- (truncateu64 t128_0); - (buf, lEN) <@ nEW_KECCAK____mwrite_subu64 (buf, lEN, t); + (buf, lEN) <@ __mwrite_subu64 (buf, lEN, t); } else { } @@ -3218,7 +3086,7 @@ module M(SC:Syscall_t) = { ((2 ^ 1) * ((0 %% (2 ^ 1)) + ((2 ^ 1) * ((1 %% (2 ^ 1)) + ((2 ^ 1) * 1)))))))))))))) )); - (buf, lEN) <@ nEW_KECCAK____mwrite_subu256 (buf, lEN, t256_4); + (buf, lEN) <@ __mwrite_subu256 (buf, lEN, t256_4); } else { } @@ -3227,8 +3095,7 @@ module M(SC:Syscall_t) = { } return buf; } - proc nEW_KECCAK____squeeze_imem_avx2 (buf:W64.t, lEN:int, - st:W256.t Array7.t, rATE8:int) : + proc __squeeze_imem_avx2 (buf:W64.t, lEN:int, st:W256.t Array7.t, rATE8:int) : W64.t * W256.t Array7.t = { var iTERS:int; var lO:int; @@ -3239,16 +3106,16 @@ module M(SC:Syscall_t) = { if ((0 < iTERS)) { i <- (W64.of_int 0); while ((i \ult (W64.of_int iTERS))) { - st <@ nEW_KECCAK___keccakf1600_avx2 (st); - buf <@ nEW_KECCAK____dumpstate_imem_avx2 (buf, rATE8, st); + st <@ _keccakf1600_avx2 (st); + buf <@ __dumpstate_imem_avx2 (buf, rATE8, st); i <- (i + (W64.of_int 1)); } } else { } if ((0 < lO)) { - st <@ nEW_KECCAK___keccakf1600_avx2 (st); - buf <@ nEW_KECCAK____dumpstate_imem_avx2 (buf, lO, st); + st <@ _keccakf1600_avx2 (st); + buf <@ __dumpstate_imem_avx2 (buf, lO, st); } else { } @@ -3257,7 +3124,7 @@ module M(SC:Syscall_t) = { } return (buf, st); } - proc nEW_KECCAK__keccakf1600_4x_theta_sum (a:W256.t Array25.t) : W256.t Array5.t = { + proc keccakf1600_4x_theta_sum (a:W256.t Array25.t) : W256.t Array5.t = { var aux:int; var c:W256.t Array5.t; var x:int; @@ -3279,8 +3146,8 @@ module M(SC:Syscall_t) = { } return c; } - proc nEW_KECCAK__keccakf1600_4x_rol (a:W256.t Array5.t, x:int, r:int, - r8:W256.t, r56:W256.t) : W256.t Array5.t = { + proc keccakf1600_4x_rol (a:W256.t Array5.t, x:int, r:int, r8:W256.t, + r56:W256.t) : W256.t Array5.t = { var t:W256.t; if ((r = 8)) { a.[x] <- (VPSHUFB_256 a.[x] r8); @@ -3295,8 +3162,8 @@ module M(SC:Syscall_t) = { } return a; } - proc nEW_KECCAK__keccakf1600_4x_theta_rol (c:W256.t Array5.t, r8:W256.t, - r56:W256.t) : W256.t Array5.t = { + proc keccakf1600_4x_theta_rol (c:W256.t Array5.t, r8:W256.t, r56:W256.t) : + W256.t Array5.t = { var aux:int; var d:W256.t Array5.t; var x:int; @@ -3304,15 +3171,14 @@ module M(SC:Syscall_t) = { x <- 0; while ((x < 5)) { d.[x] <- c.[((x + 1) %% 5)]; - d <@ nEW_KECCAK__keccakf1600_4x_rol (d, x, 1, r8, r56); + d <@ keccakf1600_4x_rol (d, x, 1, r8, r56); d.[x] <- (d.[x] `^` c.[(((x - 1) + 5) %% 5)]); x <- (x + 1); } return d; } - proc nEW_KECCAK__keccakf1600_4x_rol_sum (a:W256.t Array25.t, - d:W256.t Array5.t, y:int, - r8:W256.t, r56:W256.t) : W256.t Array5.t = { + proc keccakf1600_4x_rol_sum (a:W256.t Array25.t, d:W256.t Array5.t, y:int, + r8:W256.t, r56:W256.t) : W256.t Array5.t = { var aux:int; var b:W256.t Array5.t; var x:int; @@ -3324,11 +3190,11 @@ module M(SC:Syscall_t) = { while ((x < 5)) { x_ <- ((x + (3 * y)) %% 5); y_ <- x; - r <@ nEW_KECCAK__keccakf1600_rhotates (x_, y_); + r <@ keccakf1600_rhotates (x_, y_); b.[x] <- a.[(x_ + (y_ * 5))]; b.[x] <- (b.[x] `^` d.[x_]); if ((r <> 0)) { - b <@ nEW_KECCAK__keccakf1600_4x_rol (b, x, r, r8, r56); + b <@ keccakf1600_4x_rol (b, x, r, r8, r56); } else { } @@ -3336,9 +3202,8 @@ module M(SC:Syscall_t) = { } return b; } - proc nEW_KECCAK__keccakf1600_4x_set_row (e:W256.t Array25.t, - b:W256.t Array5.t, y:int, - rc:W256.t) : W256.t Array25.t = { + proc keccakf1600_4x_set_row (e:W256.t Array25.t, b:W256.t Array5.t, y:int, + rc:W256.t) : W256.t Array25.t = { var aux:int; var x:int; var x1:int; @@ -3360,9 +3225,8 @@ module M(SC:Syscall_t) = { } return e; } - proc nEW_KECCAK___keccakf1600_4x_round (e:W256.t Array25.t, - a:W256.t Array25.t, rc:W256.t, - r8:W256.t, r56:W256.t) : W256.t Array25.t = { + proc _keccakf1600_4x_round (e:W256.t Array25.t, a:W256.t Array25.t, + rc:W256.t, r8:W256.t, r56:W256.t) : W256.t Array25.t = { var aux:int; var c:W256.t Array5.t; var d:W256.t Array5.t; @@ -3371,17 +3235,17 @@ module M(SC:Syscall_t) = { b <- witness; c <- witness; d <- witness; - c <@ nEW_KECCAK__keccakf1600_4x_theta_sum (a); - d <@ nEW_KECCAK__keccakf1600_4x_theta_rol (c, r8, r56); + c <@ keccakf1600_4x_theta_sum (a); + d <@ keccakf1600_4x_theta_rol (c, r8, r56); y <- 0; while ((y < 5)) { - b <@ nEW_KECCAK__keccakf1600_4x_rol_sum (a, d, y, r8, r56); - e <@ nEW_KECCAK__keccakf1600_4x_set_row (e, b, y, rc); + b <@ keccakf1600_4x_rol_sum (a, d, y, r8, r56); + e <@ keccakf1600_4x_set_row (e, b, y, rc); y <- (y + 1); } return e; } - proc nEW_KECCAK____keccakf1600_avx2x4 (a:W256.t Array25.t) : W256.t Array25.t = { + proc __keccakf1600_avx2x4 (a:W256.t Array25.t) : W256.t Array25.t = { var rC:W64.t Array24.t; var s_e:W256.t Array25.t; var e:W256.t Array25.t; @@ -3394,20 +3258,20 @@ module M(SC:Syscall_t) = { a_s <- witness; e <- witness; s_e <- witness; - rC <- nEW_KECCAK__KECCAK1600_RC; + rC <- kECCAK1600_RC; e <- s_e; - r8 <- nEW_KECCAK__ROL8; - r56 <- nEW_KECCAK__ROL56; + r8 <- rOL8; + r56 <- rOL56; c <- (W64.of_int 0); while ((c \ult (W64.of_int 24))) { rc <- (VPBROADCAST_4u64 rC.[(W64.to_uint c)]); - e <@ nEW_KECCAK___keccakf1600_4x_round (e, a, rc, r8, r56); + e <@ _keccakf1600_4x_round (e, a, rc, r8, r56); a_s <- a; s_e <- e; a <- a_s; e <- s_e; rc <- (VPBROADCAST_4u64 rC.[((W64.to_uint c) + 1)]); - a <@ nEW_KECCAK___keccakf1600_4x_round (a, e, rc, r8, r56); + a <@ _keccakf1600_4x_round (a, e, rc, r8, r56); a_s <- a; s_e <- e; a <- a_s; @@ -3416,12 +3280,12 @@ module M(SC:Syscall_t) = { } return a; } - proc nEW_KECCAK___keccakf1600_avx2x4 (a:W256.t Array25.t) : W256.t Array25.t = { + proc _keccakf1600_avx2x4 (a:W256.t Array25.t) : W256.t Array25.t = { - a <@ nEW_KECCAK____keccakf1600_avx2x4 (a); + a <@ __keccakf1600_avx2x4 (a); return a; } - proc nEW_KECCAK____state_init_avx2x4 (st:W256.t Array25.t) : W256.t Array25.t = { + proc __state_init_avx2x4 (st:W256.t Array25.t) : W256.t Array25.t = { var z256:W256.t; var i:W64.t; z256 <- (set0_256); @@ -3436,8 +3300,7 @@ module M(SC:Syscall_t) = { } return st; } - proc nEW_KECCAK____addratebit_avx2x4 (st:W256.t Array25.t, rATE8:int) : - W256.t Array25.t = { + proc __addratebit_avx2x4 (st:W256.t Array25.t, rATE8:int) : W256.t Array25.t = { var t64:W64.t; var t128:W128.t; var t256:W256.t; @@ -3449,9 +3312,8 @@ module M(SC:Syscall_t) = { st.[((rATE8 - 1) %/ 8)] <- t256; return st; } - proc nEW_KECCAK____4u64x4_u256x4 (y0:W256.t, y1:W256.t, y2:W256.t, - y3:W256.t) : W256.t * W256.t * W256.t * - W256.t = { + proc __4u64x4_u256x4 (y0:W256.t, y1:W256.t, y2:W256.t, y3:W256.t) : + W256.t * W256.t * W256.t * W256.t = { var x0:W256.t; var x1:W256.t; var x2:W256.t; @@ -3466,9 +3328,8 @@ module M(SC:Syscall_t) = { y3 <- (VPUNPCKH_4u64 x2 x3); return (y0, y1, y2, y3); } - proc nEW_KECCAK__A1____aread_subu64 (buf:W8.t Array1.t, offset:W64.t, - dELTA:int, lEN:int, tRAIL:int) : - int * int * int * W64.t = { + proc a1____aread_subu64 (buf:W8.t Array1.t, offset:W64.t, dELTA:int, + lEN:int, tRAIL:int) : int * int * int * W64.t = { var w:W64.t; var iLEN:int; var t16:W64.t; @@ -3533,14 +3394,11 @@ module M(SC:Syscall_t) = { } return (dELTA, lEN, tRAIL, w); } - proc nEW_KECCAK__A1____addstate_array_avx2x4 (st:W256.t Array25.t, aT:int, - buf0:W8.t Array1.t, - buf1:W8.t Array1.t, - buf2:W8.t Array1.t, - buf3:W8.t Array1.t, - offset:W64.t, lEN:int, - tRAILB:int) : W256.t Array25.t * - int * W64.t = { + proc a1____addstate_array_avx2x4 (st:W256.t Array25.t, aT:int, + buf0:W8.t Array1.t, buf1:W8.t Array1.t, + buf2:W8.t Array1.t, buf3:W8.t Array1.t, + offset:W64.t, lEN:int, tRAILB:int) : + W256.t Array25.t * int * W64.t = { var aLL:int; var lO:int; var at:W64.t; @@ -3597,14 +3455,14 @@ module M(SC:Syscall_t) = { } else { } - ( _11, _12, _13, t0) <@ nEW_KECCAK__A1____aread_subu64 (buf0, - offset, dELTA, lEN, tRAILB); - ( _14, _15, _16, t1) <@ nEW_KECCAK__A1____aread_subu64 (buf1, - offset, dELTA, lEN, tRAILB); - ( _17, _18, _19, t2) <@ nEW_KECCAK__A1____aread_subu64 (buf2, - offset, dELTA, lEN, tRAILB); - (dELTA, _20, _21, t3) <@ nEW_KECCAK__A1____aread_subu64 (buf3, - offset, dELTA, lEN, tRAILB); + ( _11, _12, _13, t0) <@ a1____aread_subu64 (buf0, offset, dELTA, + lEN, tRAILB); + ( _14, _15, _16, t1) <@ a1____aread_subu64 (buf1, offset, dELTA, + lEN, tRAILB); + ( _17, _18, _19, t2) <@ a1____aread_subu64 (buf2, offset, dELTA, + lEN, tRAILB); + (dELTA, _20, _21, t3) <@ a1____aread_subu64 (buf3, offset, + dELTA, lEN, tRAILB); t0 <- (t0 `<<` (W8.of_int (8 * lO))); t0 <- (t0 `^` @@ -3665,14 +3523,14 @@ module M(SC:Syscall_t) = { (W64.to_uint (offset + (W64.of_int dELTA)))); offset <- (offset + (W64.of_int (8 - lO))); } else { - ( _0, _1, _2, t0) <@ nEW_KECCAK__A1____aread_subu64 (buf0, - offset, dELTA, (8 - lO), tRAILB); - ( _3, _4, _5, t1) <@ nEW_KECCAK__A1____aread_subu64 (buf1, - offset, dELTA, (8 - lO), tRAILB); - ( _6, _7, _8, t2) <@ nEW_KECCAK__A1____aread_subu64 (buf2, - offset, dELTA, (8 - lO), tRAILB); - (dELTA, _9, _10, t3) <@ nEW_KECCAK__A1____aread_subu64 (buf3, - offset, dELTA, (8 - lO), tRAILB); + ( _0, _1, _2, t0) <@ a1____aread_subu64 (buf0, offset, dELTA, + (8 - lO), tRAILB); + ( _3, _4, _5, t1) <@ a1____aread_subu64 (buf1, offset, dELTA, + (8 - lO), tRAILB); + ( _6, _7, _8, t2) <@ a1____aread_subu64 (buf2, offset, dELTA, + (8 - lO), tRAILB); + (dELTA, _9, _10, t3) <@ a1____aread_subu64 (buf3, offset, + dELTA, (8 - lO), tRAILB); } lEN <- (lEN - (8 - lO)); aT <- (aT + (8 - lO)); @@ -3738,8 +3596,8 @@ module M(SC:Syscall_t) = { (get256_direct (WArray1.init8 (fun i => buf0.[i])) (W64.to_uint offset)); offset <- (offset + (W64.of_int 32)); - (t256_0, t256_1, t256_2, t256_3) <@ nEW_KECCAK____4u64x4_u256x4 ( - t256_0, t256_1, t256_2, t256_3); + (t256_0, t256_1, t256_2, t256_3) <@ __4u64x4_u256x4 (t256_0, + t256_1, t256_2, t256_3); st <- (Array25.init (WArray800.get256 @@ -3820,14 +3678,14 @@ module M(SC:Syscall_t) = { } lO <- ((aT + lEN) %% 8); if (((0 < lO) \/ (tRAILB <> 0))) { - ( _22, _23, _24, t0) <@ nEW_KECCAK__A1____aread_subu64 (buf0, - offset, dELTA, lO, tRAILB); - ( _25, _26, _27, t1) <@ nEW_KECCAK__A1____aread_subu64 (buf1, - offset, dELTA, lO, tRAILB); - ( _28, _29, _30, t2) <@ nEW_KECCAK__A1____aread_subu64 (buf2, - offset, dELTA, lO, tRAILB); - (dELTA, _31, _32, t3) <@ nEW_KECCAK__A1____aread_subu64 (buf3, - offset, dELTA, lO, tRAILB); + ( _22, _23, _24, t0) <@ a1____aread_subu64 (buf0, offset, dELTA, + lO, tRAILB); + ( _25, _26, _27, t1) <@ a1____aread_subu64 (buf1, offset, dELTA, + lO, tRAILB); + ( _28, _29, _30, t2) <@ a1____aread_subu64 (buf2, offset, dELTA, + lO, tRAILB); + (dELTA, _31, _32, t3) <@ a1____aread_subu64 (buf3, offset, dELTA, + lO, tRAILB); offset <- (offset + (W64.of_int dELTA)); if ((tRAILB <> 0)) { aLL <- (aLL + 1); @@ -3876,14 +3734,12 @@ module M(SC:Syscall_t) = { } return (st, aLL, offset); } - proc nEW_KECCAK__A1____absorb_array_avx2x4 (st:W256.t Array25.t, aT:int, - buf0:W8.t Array1.t, - buf1:W8.t Array1.t, - buf2:W8.t Array1.t, - buf3:W8.t Array1.t, - offset:W64.t, lEN:int, - rATE8:int, tRAILB:int) : - W256.t Array25.t * int * W64.t = { + proc a1____absorb_array_avx2x4 (st:W256.t Array25.t, aT:int, + buf0:W8.t Array1.t, buf1:W8.t Array1.t, + buf2:W8.t Array1.t, buf3:W8.t Array1.t, + offset:W64.t, lEN:int, rATE8:int, + tRAILB:int) : W256.t Array25.t * int * + W64.t = { var aLL:int; var iTERS:int; var i:W64.t; @@ -3891,19 +3747,19 @@ module M(SC:Syscall_t) = { var _1:int; aLL <- (aT + lEN); if (((aT + lEN) < rATE8)) { - (st, aT, offset) <@ nEW_KECCAK__A1____addstate_array_avx2x4 (st, - aT, buf0, buf1, buf2, buf3, offset, lEN, tRAILB); + (st, aT, offset) <@ a1____addstate_array_avx2x4 (st, aT, buf0, + buf1, buf2, buf3, offset, lEN, tRAILB); if ((tRAILB <> 0)) { - st <@ nEW_KECCAK____addratebit_avx2x4 (st, rATE8); + st <@ __addratebit_avx2x4 (st, rATE8); } else { } } else { if ((aT <> 0)) { - (st, _0, offset) <@ nEW_KECCAK__A1____addstate_array_avx2x4 ( - st, aT, buf0, buf1, buf2, buf3, offset, (rATE8 - aT), 0); + (st, _0, offset) <@ a1____addstate_array_avx2x4 (st, aT, buf0, + buf1, buf2, buf3, offset, (rATE8 - aT), 0); lEN <- (lEN - (rATE8 - aT)); - st <@ nEW_KECCAK___keccakf1600_avx2x4 (st); + st <@ _keccakf1600_avx2x4 (st); aT <- 0; } else { @@ -3911,25 +3767,24 @@ module M(SC:Syscall_t) = { iTERS <- (lEN %/ rATE8); i <- (W64.of_int 0); while ((i \ult (W64.of_int iTERS))) { - (st, _1, offset) <@ nEW_KECCAK__A1____addstate_array_avx2x4 ( - st, 0, buf0, buf1, buf2, buf3, offset, rATE8, 0); - st <@ nEW_KECCAK___keccakf1600_avx2x4 (st); + (st, _1, offset) <@ a1____addstate_array_avx2x4 (st, 0, buf0, + buf1, buf2, buf3, offset, rATE8, 0); + st <@ _keccakf1600_avx2x4 (st); i <- (i + (W64.of_int 1)); } lEN <- (aLL %% rATE8); - (st, aT, offset) <@ nEW_KECCAK__A1____addstate_array_avx2x4 (st, 0, - buf0, buf1, buf2, buf3, offset, lEN, tRAILB); + (st, aT, offset) <@ a1____addstate_array_avx2x4 (st, 0, buf0, buf1, + buf2, buf3, offset, lEN, tRAILB); if ((tRAILB <> 0)) { - st <@ nEW_KECCAK____addratebit_avx2x4 (st, rATE8); + st <@ __addratebit_avx2x4 (st, rATE8); } else { } } return (st, aT, offset); } - proc nEW_KECCAK__A2____aread_subu64 (buf:W8.t Array2.t, offset:W64.t, - dELTA:int, lEN:int, tRAIL:int) : - int * int * int * W64.t = { + proc a2____aread_subu64 (buf:W8.t Array2.t, offset:W64.t, dELTA:int, + lEN:int, tRAIL:int) : int * int * int * W64.t = { var w:W64.t; var iLEN:int; var t16:W64.t; @@ -3994,9 +3849,8 @@ module M(SC:Syscall_t) = { } return (dELTA, lEN, tRAIL, w); } - proc nEW_KECCAK__A2____aread_subu128 (buf:W8.t Array2.t, offset:W64.t, - dELTA:int, lEN:int, tRAIL:int) : - int * int * int * W128.t = { + proc a2____aread_subu128 (buf:W8.t Array2.t, offset:W64.t, dELTA:int, + lEN:int, tRAIL:int) : int * int * int * W128.t = { var w:W128.t; var t64:W64.t; if (((lEN <= 0) /\ (tRAIL = 0))) { @@ -4016,21 +3870,20 @@ module M(SC:Syscall_t) = { (W64.to_uint (offset + (W64.of_int dELTA))))); dELTA <- (dELTA + 8); lEN <- (lEN - 8); - (dELTA, lEN, tRAIL, t64) <@ nEW_KECCAK__A2____aread_subu64 ( - buf, offset, dELTA, lEN, tRAIL); + (dELTA, lEN, tRAIL, t64) <@ a2____aread_subu64 (buf, offset, + dELTA, lEN, tRAIL); w <- (VPINSR_2u64 w t64 (W8.of_int 1)); } else { - (dELTA, lEN, tRAIL, t64) <@ nEW_KECCAK__A2____aread_subu64 ( - buf, offset, dELTA, lEN, tRAIL); + (dELTA, lEN, tRAIL, t64) <@ a2____aread_subu64 (buf, offset, + dELTA, lEN, tRAIL); w <- (zeroextu128 t64); } } } return (dELTA, lEN, tRAIL, w); } - proc nEW_KECCAK__A2____aread_subu256 (buf:W8.t Array2.t, offset:W64.t, - dELTA:int, lEN:int, tRAIL:int) : - int * int * int * W256.t = { + proc a2____aread_subu256 (buf:W8.t Array2.t, offset:W64.t, dELTA:int, + lEN:int, tRAIL:int) : int * int * int * W256.t = { var w:W256.t; var t128_1:W128.t; var t128_0:W128.t; @@ -4050,16 +3903,16 @@ module M(SC:Syscall_t) = { (W64.to_uint (offset + (W64.of_int dELTA)))); dELTA <- (dELTA + 16); lEN <- (lEN - 16); - (dELTA, lEN, tRAIL, t128_1) <@ nEW_KECCAK__A2____aread_subu128 ( - buf, offset, dELTA, lEN, tRAIL); + (dELTA, lEN, tRAIL, t128_1) <@ a2____aread_subu128 (buf, offset, + dELTA, lEN, tRAIL); w <- (W256.of_int (((W128.to_uint t128_0) %% (2 ^ 128)) + ((2 ^ 128) * (W128.to_uint t128_1)))); } else { t128_1 <- (set0_128); - (dELTA, lEN, tRAIL, t128_0) <@ nEW_KECCAK__A2____aread_subu128 ( - buf, offset, dELTA, lEN, tRAIL); + (dELTA, lEN, tRAIL, t128_0) <@ a2____aread_subu128 (buf, offset, + dELTA, lEN, tRAIL); w <- (W256.of_int (((W128.to_uint t128_0) %% (2 ^ 128)) + @@ -4069,11 +3922,9 @@ module M(SC:Syscall_t) = { } return (dELTA, lEN, tRAIL, w); } - proc nEW_KECCAK__A2____addstate_array_avx2 (st:W256.t Array7.t, - buf:W8.t Array2.t, - offset:W64.t, lEN:int, - tRAILB:int) : W256.t Array7.t * - W64.t = { + proc a2____addstate_array_avx2 (st:W256.t Array7.t, buf:W8.t Array2.t, + offset:W64.t, lEN:int, tRAILB:int) : + W256.t Array7.t * W64.t = { var dELTA:int; var t64:W64.t; var t128_0:W128.t; @@ -4086,52 +3937,50 @@ module M(SC:Syscall_t) = { var r2:W256.t; var r6:W256.t; dELTA <- 0; - (dELTA, lEN, tRAILB, t64) <@ nEW_KECCAK__A2____aread_subu64 (buf, - offset, dELTA, lEN, tRAILB); + (dELTA, lEN, tRAILB, t64) <@ a2____aread_subu64 (buf, offset, dELTA, + lEN, tRAILB); t128_0 <- (zeroextu128 t64); r0 <- (VPBROADCAST_4u64 (truncateu64 t128_0)); st.[0] <- (st.[0] `^` r0); - (dELTA, lEN, tRAILB, r1) <@ nEW_KECCAK__A2____aread_subu256 (buf, - offset, dELTA, lEN, tRAILB); + (dELTA, lEN, tRAILB, r1) <@ a2____aread_subu256 (buf, offset, dELTA, + lEN, tRAILB); st.[1] <- (st.[1] `^` r1); if ((0 < lEN)) { - (dELTA, lEN, tRAILB, t64) <@ nEW_KECCAK__A2____aread_subu64 (buf, - offset, dELTA, lEN, tRAILB); + (dELTA, lEN, tRAILB, t64) <@ a2____aread_subu64 (buf, offset, dELTA, + lEN, tRAILB); t128_1 <- (zeroextu128 t64); - (dELTA, lEN, tRAILB, r3) <@ nEW_KECCAK__A2____aread_subu256 (buf, - offset, dELTA, lEN, tRAILB); - (dELTA, lEN, tRAILB, t64) <@ nEW_KECCAK__A2____aread_subu64 (buf, - offset, dELTA, lEN, tRAILB); + (dELTA, lEN, tRAILB, r3) <@ a2____aread_subu256 (buf, offset, dELTA, + lEN, tRAILB); + (dELTA, lEN, tRAILB, t64) <@ a2____aread_subu64 (buf, offset, dELTA, + lEN, tRAILB); t128_0 <- (zeroextu128 t64); - (dELTA, lEN, tRAILB, r4) <@ nEW_KECCAK__A2____aread_subu256 (buf, - offset, dELTA, lEN, tRAILB); - (dELTA, lEN, tRAILB, t64) <@ nEW_KECCAK__A2____aread_subu64 (buf, - offset, dELTA, lEN, tRAILB); + (dELTA, lEN, tRAILB, r4) <@ a2____aread_subu256 (buf, offset, dELTA, + lEN, tRAILB); + (dELTA, lEN, tRAILB, t64) <@ a2____aread_subu64 (buf, offset, dELTA, + lEN, tRAILB); t128_1 <- (VPINSR_2u64 t128_1 t64 (W8.of_int 1)); - (dELTA, lEN, tRAILB, r5) <@ nEW_KECCAK__A2____aread_subu256 (buf, - offset, dELTA, lEN, tRAILB); - (dELTA, lEN, tRAILB, t64) <@ nEW_KECCAK__A2____aread_subu64 (buf, - offset, dELTA, lEN, tRAILB); + (dELTA, lEN, tRAILB, r5) <@ a2____aread_subu256 (buf, offset, dELTA, + lEN, tRAILB); + (dELTA, lEN, tRAILB, t64) <@ a2____aread_subu64 (buf, offset, dELTA, + lEN, tRAILB); t128_0 <- (VPINSR_2u64 t128_0 t64 (W8.of_int 1)); r2 <- (W256.of_int (((W128.to_uint t128_0) %% (2 ^ 128)) + ((2 ^ 128) * (W128.to_uint t128_1)))); st.[2] <- (st.[2] `^` r2); - (dELTA, lEN, tRAILB, r6) <@ nEW_KECCAK__A2____aread_subu256 (buf, - offset, dELTA, lEN, tRAILB); - st <@ nEW_KECCAK____addstate_r3456 (st, r3, r4, r5, r6); + (dELTA, lEN, tRAILB, r6) <@ a2____aread_subu256 (buf, offset, dELTA, + lEN, tRAILB); + st <@ __addstate_r3456 (st, r3, r4, r5, r6); } else { } offset <- (offset + (W64.of_int dELTA)); return (st, offset); } - proc nEW_KECCAK__A2____pstate_array_avx2 (pst:W64.t Array25.t, aT:int, - buf:W8.t Array2.t, offset:W64.t, - lEN:int, tRAILB:int) : W64.t Array25.t * - int * - W64.t = { + proc a2____pstate_array_avx2 (pst:W64.t Array25.t, aT:int, + buf:W8.t Array2.t, offset:W64.t, lEN:int, + tRAILB:int) : W64.t Array25.t * int * W64.t = { var aLL:int; var dELTA:int; var lO:int; @@ -4154,8 +4003,8 @@ module M(SC:Syscall_t) = { } else { } - (dELTA, _2, tRAILB, t64) <@ nEW_KECCAK__A2____aread_subu64 ( - buf, offset, dELTA, lEN, tRAILB); + (dELTA, _2, tRAILB, t64) <@ a2____aread_subu64 (buf, offset, + dELTA, lEN, tRAILB); t64 <- (t64 `<<` (W8.of_int (8 * lO))); pst.[(W64.to_uint at)] <- (pst.[(W64.to_uint at)] `^` t64); lO <- 0; @@ -4168,8 +4017,8 @@ module M(SC:Syscall_t) = { (W64.to_uint (offset + (W64.of_int dELTA)))); dELTA <- (dELTA + (8 - lO)); } else { - (dELTA, _0, _1, t64) <@ nEW_KECCAK__A2____aread_subu64 (buf, - offset, dELTA, (8 - lO), 0); + (dELTA, _0, _1, t64) <@ a2____aread_subu64 (buf, offset, + dELTA, (8 - lO), 0); } lEN <- (lEN - (8 - lO)); aT <- (aT + (8 - lO)); @@ -4236,8 +4085,8 @@ module M(SC:Syscall_t) = { } else { } - (dELTA, _3, tRAILB, t64) <@ nEW_KECCAK__A2____aread_subu64 (buf, - offset, dELTA, lO, tRAILB); + (dELTA, _3, tRAILB, t64) <@ a2____aread_subu64 (buf, offset, dELTA, + lO, tRAILB); pst.[(aLL %/ 8)] <- t64; } else { @@ -4245,10 +4094,9 @@ module M(SC:Syscall_t) = { offset <- (offset + (W64.of_int dELTA)); return (pst, aLL, offset); } - proc nEW_KECCAK__A2____pabsorb_array_avx2 (pst:W64.t Array25.t, aT:int, - st:W256.t Array7.t, - buf:W8.t Array2.t, offset:W64.t, - lEN:int, rATE8:int, tRAILB:int) : + proc a2____pabsorb_array_avx2 (pst:W64.t Array25.t, aT:int, + st:W256.t Array7.t, buf:W8.t Array2.t, + offset:W64.t, lEN:int, rATE8:int, tRAILB:int) : W64.t Array25.t * int * W256.t Array7.t * W64.t = { var aLL:int; var iTERS:int; @@ -4256,8 +4104,8 @@ module M(SC:Syscall_t) = { var _0:int; aLL <- (aT + lEN); if (((aT + lEN) < rATE8)) { - (pst, aT, offset) <@ nEW_KECCAK__A2____pstate_array_avx2 (pst, - aT, buf, offset, lEN, tRAILB); + (pst, aT, offset) <@ a2____pstate_array_avx2 (pst, aT, buf, offset, + lEN, tRAILB); if ((tRAILB <> 0)) { i <- (W64.of_int ((aT %/ 8) + 1)); if ((aT <= (5 * 8))) { @@ -4265,8 +4113,8 @@ module M(SC:Syscall_t) = { pst.[(W64.to_uint i)] <- (W64.of_int 0); i <- (i + (W64.of_int 1)); } - st <@ nEW_KECCAK____addpst01 (st, pst); - st <@ nEW_KECCAK____addratebit_avx2 (st, rATE8); + st <@ __addpst01 (st, pst); + st <@ __addratebit_avx2 (st, rATE8); } else { while ((i \ult (W64.of_int (rATE8 %/ 8)))) { pst.[(W64.to_uint i)] <- (W64.of_int 0); @@ -4279,18 +4127,18 @@ module M(SC:Syscall_t) = { (rATE8 - 1) ((get8 (WArray200.init64 (fun i_0 => pst.[i_0])) (rATE8 - 1)) `^` (W8.of_int 128))))); - st <@ nEW_KECCAK___addpstate_avx2 (st, pst); + st <@ _addpstate_avx2 (st, pst); } } else { } } else { if ((aT <> 0)) { - (pst, _0, offset) <@ nEW_KECCAK__A2____pstate_array_avx2 (pst, - aT, buf, offset, (rATE8 - aT), 0); + (pst, _0, offset) <@ a2____pstate_array_avx2 (pst, aT, buf, + offset, (rATE8 - aT), 0); lEN <- (lEN - (rATE8 - aT)); - st <@ nEW_KECCAK___addpstate_avx2 (st, pst); - st <@ nEW_KECCAK___keccakf1600_avx2 (st); + st <@ _addpstate_avx2 (st, pst); + st <@ _keccakf1600_avx2 (st); aT <- 0; } else { @@ -4298,20 +4146,20 @@ module M(SC:Syscall_t) = { iTERS <- (lEN %/ rATE8); i <- (W64.of_int 0); while ((i \ult (W64.of_int iTERS))) { - (st, offset) <@ nEW_KECCAK__A2____addstate_array_avx2 (st, buf, - offset, rATE8, 0); - st <@ nEW_KECCAK___keccakf1600_avx2 (st); + (st, offset) <@ a2____addstate_array_avx2 (st, buf, offset, rATE8, + 0); + st <@ _keccakf1600_avx2 (st); i <- (i + (W64.of_int 1)); } lEN <- (aLL %% rATE8); if ((tRAILB <> 0)) { - (st, offset) <@ nEW_KECCAK__A2____addstate_array_avx2 (st, buf, - offset, lEN, tRAILB); - st <@ nEW_KECCAK____addratebit_avx2 (st, rATE8); + (st, offset) <@ a2____addstate_array_avx2 (st, buf, offset, lEN, + tRAILB); + st <@ __addratebit_avx2 (st, rATE8); } else { if ((lEN <> 0)) { - (pst, aT, offset) <@ nEW_KECCAK__A2____pstate_array_avx2 (pst, 0, - buf, offset, lEN, tRAILB); + (pst, aT, offset) <@ a2____pstate_array_avx2 (pst, 0, buf, + offset, lEN, tRAILB); } else { } @@ -4319,14 +4167,11 @@ module M(SC:Syscall_t) = { } return (pst, aT, st, offset); } - proc nEW_KECCAK__A2____addstate_array_avx2x4 (st:W256.t Array25.t, aT:int, - buf0:W8.t Array2.t, - buf1:W8.t Array2.t, - buf2:W8.t Array2.t, - buf3:W8.t Array2.t, - offset:W64.t, lEN:int, - tRAILB:int) : W256.t Array25.t * - int * W64.t = { + proc a2____addstate_array_avx2x4 (st:W256.t Array25.t, aT:int, + buf0:W8.t Array2.t, buf1:W8.t Array2.t, + buf2:W8.t Array2.t, buf3:W8.t Array2.t, + offset:W64.t, lEN:int, tRAILB:int) : + W256.t Array25.t * int * W64.t = { var aLL:int; var lO:int; var at:W64.t; @@ -4383,14 +4228,14 @@ module M(SC:Syscall_t) = { } else { } - ( _11, _12, _13, t0) <@ nEW_KECCAK__A2____aread_subu64 (buf0, - offset, dELTA, lEN, tRAILB); - ( _14, _15, _16, t1) <@ nEW_KECCAK__A2____aread_subu64 (buf1, - offset, dELTA, lEN, tRAILB); - ( _17, _18, _19, t2) <@ nEW_KECCAK__A2____aread_subu64 (buf2, - offset, dELTA, lEN, tRAILB); - (dELTA, _20, _21, t3) <@ nEW_KECCAK__A2____aread_subu64 (buf3, - offset, dELTA, lEN, tRAILB); + ( _11, _12, _13, t0) <@ a2____aread_subu64 (buf0, offset, dELTA, + lEN, tRAILB); + ( _14, _15, _16, t1) <@ a2____aread_subu64 (buf1, offset, dELTA, + lEN, tRAILB); + ( _17, _18, _19, t2) <@ a2____aread_subu64 (buf2, offset, dELTA, + lEN, tRAILB); + (dELTA, _20, _21, t3) <@ a2____aread_subu64 (buf3, offset, + dELTA, lEN, tRAILB); t0 <- (t0 `<<` (W8.of_int (8 * lO))); t0 <- (t0 `^` @@ -4451,14 +4296,14 @@ module M(SC:Syscall_t) = { (W64.to_uint (offset + (W64.of_int dELTA)))); offset <- (offset + (W64.of_int (8 - lO))); } else { - ( _0, _1, _2, t0) <@ nEW_KECCAK__A2____aread_subu64 (buf0, - offset, dELTA, (8 - lO), tRAILB); - ( _3, _4, _5, t1) <@ nEW_KECCAK__A2____aread_subu64 (buf1, - offset, dELTA, (8 - lO), tRAILB); - ( _6, _7, _8, t2) <@ nEW_KECCAK__A2____aread_subu64 (buf2, - offset, dELTA, (8 - lO), tRAILB); - (dELTA, _9, _10, t3) <@ nEW_KECCAK__A2____aread_subu64 (buf3, - offset, dELTA, (8 - lO), tRAILB); + ( _0, _1, _2, t0) <@ a2____aread_subu64 (buf0, offset, dELTA, + (8 - lO), tRAILB); + ( _3, _4, _5, t1) <@ a2____aread_subu64 (buf1, offset, dELTA, + (8 - lO), tRAILB); + ( _6, _7, _8, t2) <@ a2____aread_subu64 (buf2, offset, dELTA, + (8 - lO), tRAILB); + (dELTA, _9, _10, t3) <@ a2____aread_subu64 (buf3, offset, + dELTA, (8 - lO), tRAILB); } lEN <- (lEN - (8 - lO)); aT <- (aT + (8 - lO)); @@ -4524,8 +4369,8 @@ module M(SC:Syscall_t) = { (get256_direct (WArray2.init8 (fun i => buf0.[i])) (W64.to_uint offset)); offset <- (offset + (W64.of_int 32)); - (t256_0, t256_1, t256_2, t256_3) <@ nEW_KECCAK____4u64x4_u256x4 ( - t256_0, t256_1, t256_2, t256_3); + (t256_0, t256_1, t256_2, t256_3) <@ __4u64x4_u256x4 (t256_0, + t256_1, t256_2, t256_3); st <- (Array25.init (WArray800.get256 @@ -4606,14 +4451,14 @@ module M(SC:Syscall_t) = { } lO <- ((aT + lEN) %% 8); if (((0 < lO) \/ (tRAILB <> 0))) { - ( _22, _23, _24, t0) <@ nEW_KECCAK__A2____aread_subu64 (buf0, - offset, dELTA, lO, tRAILB); - ( _25, _26, _27, t1) <@ nEW_KECCAK__A2____aread_subu64 (buf1, - offset, dELTA, lO, tRAILB); - ( _28, _29, _30, t2) <@ nEW_KECCAK__A2____aread_subu64 (buf2, - offset, dELTA, lO, tRAILB); - (dELTA, _31, _32, t3) <@ nEW_KECCAK__A2____aread_subu64 (buf3, - offset, dELTA, lO, tRAILB); + ( _22, _23, _24, t0) <@ a2____aread_subu64 (buf0, offset, dELTA, + lO, tRAILB); + ( _25, _26, _27, t1) <@ a2____aread_subu64 (buf1, offset, dELTA, + lO, tRAILB); + ( _28, _29, _30, t2) <@ a2____aread_subu64 (buf2, offset, dELTA, + lO, tRAILB); + (dELTA, _31, _32, t3) <@ a2____aread_subu64 (buf3, offset, dELTA, + lO, tRAILB); offset <- (offset + (W64.of_int dELTA)); if ((tRAILB <> 0)) { aLL <- (aLL + 1); @@ -4662,14 +4507,12 @@ module M(SC:Syscall_t) = { } return (st, aLL, offset); } - proc nEW_KECCAK__A2____absorb_array_avx2x4 (st:W256.t Array25.t, aT:int, - buf0:W8.t Array2.t, - buf1:W8.t Array2.t, - buf2:W8.t Array2.t, - buf3:W8.t Array2.t, - offset:W64.t, lEN:int, - rATE8:int, tRAILB:int) : - W256.t Array25.t * int * W64.t = { + proc a2____absorb_array_avx2x4 (st:W256.t Array25.t, aT:int, + buf0:W8.t Array2.t, buf1:W8.t Array2.t, + buf2:W8.t Array2.t, buf3:W8.t Array2.t, + offset:W64.t, lEN:int, rATE8:int, + tRAILB:int) : W256.t Array25.t * int * + W64.t = { var aLL:int; var iTERS:int; var i:W64.t; @@ -4677,19 +4520,19 @@ module M(SC:Syscall_t) = { var _1:int; aLL <- (aT + lEN); if (((aT + lEN) < rATE8)) { - (st, aT, offset) <@ nEW_KECCAK__A2____addstate_array_avx2x4 (st, - aT, buf0, buf1, buf2, buf3, offset, lEN, tRAILB); + (st, aT, offset) <@ a2____addstate_array_avx2x4 (st, aT, buf0, + buf1, buf2, buf3, offset, lEN, tRAILB); if ((tRAILB <> 0)) { - st <@ nEW_KECCAK____addratebit_avx2x4 (st, rATE8); + st <@ __addratebit_avx2x4 (st, rATE8); } else { } } else { if ((aT <> 0)) { - (st, _0, offset) <@ nEW_KECCAK__A2____addstate_array_avx2x4 ( - st, aT, buf0, buf1, buf2, buf3, offset, (rATE8 - aT), 0); + (st, _0, offset) <@ a2____addstate_array_avx2x4 (st, aT, buf0, + buf1, buf2, buf3, offset, (rATE8 - aT), 0); lEN <- (lEN - (rATE8 - aT)); - st <@ nEW_KECCAK___keccakf1600_avx2x4 (st); + st <@ _keccakf1600_avx2x4 (st); aT <- 0; } else { @@ -4697,25 +4540,24 @@ module M(SC:Syscall_t) = { iTERS <- (lEN %/ rATE8); i <- (W64.of_int 0); while ((i \ult (W64.of_int iTERS))) { - (st, _1, offset) <@ nEW_KECCAK__A2____addstate_array_avx2x4 ( - st, 0, buf0, buf1, buf2, buf3, offset, rATE8, 0); - st <@ nEW_KECCAK___keccakf1600_avx2x4 (st); + (st, _1, offset) <@ a2____addstate_array_avx2x4 (st, 0, buf0, + buf1, buf2, buf3, offset, rATE8, 0); + st <@ _keccakf1600_avx2x4 (st); i <- (i + (W64.of_int 1)); } lEN <- (aLL %% rATE8); - (st, aT, offset) <@ nEW_KECCAK__A2____addstate_array_avx2x4 (st, 0, - buf0, buf1, buf2, buf3, offset, lEN, tRAILB); + (st, aT, offset) <@ a2____addstate_array_avx2x4 (st, 0, buf0, buf1, + buf2, buf3, offset, lEN, tRAILB); if ((tRAILB <> 0)) { - st <@ nEW_KECCAK____addratebit_avx2x4 (st, rATE8); + st <@ __addratebit_avx2x4 (st, rATE8); } else { } } return (st, aT, offset); } - proc nEW_KECCAK__A32____aread_subu64 (buf:W8.t Array32.t, offset:W64.t, - dELTA:int, lEN:int, tRAIL:int) : - int * int * int * W64.t = { + proc a32____aread_subu64 (buf:W8.t Array32.t, offset:W64.t, dELTA:int, + lEN:int, tRAIL:int) : int * int * int * W64.t = { var w:W64.t; var iLEN:int; var t16:W64.t; @@ -4780,10 +4622,11 @@ module M(SC:Syscall_t) = { } return (dELTA, lEN, tRAIL, w); } - proc nEW_KECCAK__A32____aread_bcast_4subu64 (buf:W8.t Array32.t, - offset:W64.t, dELTA:int, - lEN:int, tRAIL:int) : - int * int * int * W256.t = { + proc a32____aread_bcast_4subu64 (buf:W8.t Array32.t, offset:W64.t, + dELTA:int, lEN:int, tRAIL:int) : int * + int * + int * + W256.t = { var w:W256.t; var t64:W64.t; var t128:W128.t; @@ -4798,17 +4641,16 @@ module M(SC:Syscall_t) = { dELTA <- (dELTA + 8); lEN <- (lEN - 8); } else { - (dELTA, lEN, tRAIL, t64) <@ nEW_KECCAK__A32____aread_subu64 ( - buf, offset, dELTA, lEN, tRAIL); + (dELTA, lEN, tRAIL, t64) <@ a32____aread_subu64 (buf, offset, + dELTA, lEN, tRAIL); t128 <- (zeroextu128 t64); w <- (VPBROADCAST_4u64 (truncateu64 t128)); } } return (dELTA, lEN, tRAIL, w); } - proc nEW_KECCAK__A32____aread_subu128 (buf:W8.t Array32.t, offset:W64.t, - dELTA:int, lEN:int, tRAIL:int) : - int * int * int * W128.t = { + proc a32____aread_subu128 (buf:W8.t Array32.t, offset:W64.t, dELTA:int, + lEN:int, tRAIL:int) : int * int * int * W128.t = { var w:W128.t; var t64:W64.t; if (((lEN <= 0) /\ (tRAIL = 0))) { @@ -4828,21 +4670,20 @@ module M(SC:Syscall_t) = { (W64.to_uint (offset + (W64.of_int dELTA))))); dELTA <- (dELTA + 8); lEN <- (lEN - 8); - (dELTA, lEN, tRAIL, t64) <@ nEW_KECCAK__A32____aread_subu64 ( - buf, offset, dELTA, lEN, tRAIL); + (dELTA, lEN, tRAIL, t64) <@ a32____aread_subu64 (buf, offset, + dELTA, lEN, tRAIL); w <- (VPINSR_2u64 w t64 (W8.of_int 1)); } else { - (dELTA, lEN, tRAIL, t64) <@ nEW_KECCAK__A32____aread_subu64 ( - buf, offset, dELTA, lEN, tRAIL); + (dELTA, lEN, tRAIL, t64) <@ a32____aread_subu64 (buf, offset, + dELTA, lEN, tRAIL); w <- (zeroextu128 t64); } } } return (dELTA, lEN, tRAIL, w); } - proc nEW_KECCAK__A32____aread_subu256 (buf:W8.t Array32.t, offset:W64.t, - dELTA:int, lEN:int, tRAIL:int) : - int * int * int * W256.t = { + proc a32____aread_subu256 (buf:W8.t Array32.t, offset:W64.t, dELTA:int, + lEN:int, tRAIL:int) : int * int * int * W256.t = { var w:W256.t; var t128_1:W128.t; var t128_0:W128.t; @@ -4862,16 +4703,16 @@ module M(SC:Syscall_t) = { (W64.to_uint (offset + (W64.of_int dELTA)))); dELTA <- (dELTA + 16); lEN <- (lEN - 16); - (dELTA, lEN, tRAIL, t128_1) <@ nEW_KECCAK__A32____aread_subu128 ( - buf, offset, dELTA, lEN, tRAIL); + (dELTA, lEN, tRAIL, t128_1) <@ a32____aread_subu128 (buf, offset, + dELTA, lEN, tRAIL); w <- (W256.of_int (((W128.to_uint t128_0) %% (2 ^ 128)) + ((2 ^ 128) * (W128.to_uint t128_1)))); } else { t128_1 <- (set0_128); - (dELTA, lEN, tRAIL, t128_0) <@ nEW_KECCAK__A32____aread_subu128 ( - buf, offset, dELTA, lEN, tRAIL); + (dELTA, lEN, tRAIL, t128_0) <@ a32____aread_subu128 (buf, offset, + dELTA, lEN, tRAIL); w <- (W256.of_int (((W128.to_uint t128_0) %% (2 ^ 128)) + @@ -4881,9 +4722,8 @@ module M(SC:Syscall_t) = { } return (dELTA, lEN, tRAIL, w); } - proc nEW_KECCAK__A32____awrite_subu64 (buf:W8.t Array32.t, offset:W64.t, - dELTA:int, lEN:int, w:W64.t) : - W8.t Array32.t * int * int = { + proc a32____awrite_subu64 (buf:W8.t Array32.t, offset:W64.t, dELTA:int, + lEN:int, w:W64.t) : W8.t Array32.t * int * int = { if ((0 < lEN)) { if ((8 <= lEN)) { @@ -4936,9 +4776,8 @@ module M(SC:Syscall_t) = { } return (buf, dELTA, lEN); } - proc nEW_KECCAK__A32____awrite_subu128 (buf:W8.t Array32.t, offset:W64.t, - dELTA:int, lEN:int, w:W128.t) : - W8.t Array32.t * int * int = { + proc a32____awrite_subu128 (buf:W8.t Array32.t, offset:W64.t, dELTA:int, + lEN:int, w:W128.t) : W8.t Array32.t * int * int = { var t64:W64.t; if ((0 < lEN)) { if ((16 <= lEN)) { @@ -4964,17 +4803,16 @@ module M(SC:Syscall_t) = { } t64 <- (truncateu64 w); - (buf, dELTA, lEN) <@ nEW_KECCAK__A32____awrite_subu64 (buf, offset, - dELTA, lEN, t64); + (buf, dELTA, lEN) <@ a32____awrite_subu64 (buf, offset, dELTA, + lEN, t64); } } else { } return (buf, dELTA, lEN); } - proc nEW_KECCAK__A32____awrite_subu256 (buf:W8.t Array32.t, offset:W64.t, - dELTA:int, lEN:int, w:W256.t) : - W8.t Array32.t * int * int = { + proc a32____awrite_subu256 (buf:W8.t Array32.t, offset:W64.t, dELTA:int, + lEN:int, w:W256.t) : W8.t Array32.t * int * int = { var t128:W128.t; if ((0 < lEN)) { if ((32 <= lEN)) { @@ -4998,19 +4836,17 @@ module M(SC:Syscall_t) = { } else { t128 <- (truncateu128 w); } - (buf, dELTA, lEN) <@ nEW_KECCAK__A32____awrite_subu128 (buf, - offset, dELTA, lEN, t128); + (buf, dELTA, lEN) <@ a32____awrite_subu128 (buf, offset, dELTA, + lEN, t128); } } else { } return (buf, dELTA, lEN); } - proc nEW_KECCAK__A32____addstate_array_avx2 (st:W256.t Array7.t, - buf:W8.t Array32.t, - offset:W64.t, lEN:int, - tRAILB:int) : W256.t Array7.t * - W64.t = { + proc a32____addstate_array_avx2 (st:W256.t Array7.t, buf:W8.t Array32.t, + offset:W64.t, lEN:int, tRAILB:int) : + W256.t Array7.t * W64.t = { var dELTA:int; var t64:W64.t; var t128_0:W128.t; @@ -5023,51 +4859,49 @@ module M(SC:Syscall_t) = { var r2:W256.t; var r6:W256.t; dELTA <- 0; - (dELTA, lEN, tRAILB, t64) <@ nEW_KECCAK__A32____aread_subu64 (buf, - offset, dELTA, lEN, tRAILB); + (dELTA, lEN, tRAILB, t64) <@ a32____aread_subu64 (buf, offset, dELTA, + lEN, tRAILB); t128_0 <- (zeroextu128 t64); r0 <- (VPBROADCAST_4u64 (truncateu64 t128_0)); st.[0] <- (st.[0] `^` r0); - (dELTA, lEN, tRAILB, r1) <@ nEW_KECCAK__A32____aread_subu256 (buf, - offset, dELTA, lEN, tRAILB); + (dELTA, lEN, tRAILB, r1) <@ a32____aread_subu256 (buf, offset, dELTA, + lEN, tRAILB); st.[1] <- (st.[1] `^` r1); if ((0 < lEN)) { - (dELTA, lEN, tRAILB, t64) <@ nEW_KECCAK__A32____aread_subu64 (buf, - offset, dELTA, lEN, tRAILB); + (dELTA, lEN, tRAILB, t64) <@ a32____aread_subu64 (buf, offset, + dELTA, lEN, tRAILB); t128_1 <- (zeroextu128 t64); - (dELTA, lEN, tRAILB, r3) <@ nEW_KECCAK__A32____aread_subu256 (buf, - offset, dELTA, lEN, tRAILB); - (dELTA, lEN, tRAILB, t64) <@ nEW_KECCAK__A32____aread_subu64 (buf, - offset, dELTA, lEN, tRAILB); + (dELTA, lEN, tRAILB, r3) <@ a32____aread_subu256 (buf, offset, + dELTA, lEN, tRAILB); + (dELTA, lEN, tRAILB, t64) <@ a32____aread_subu64 (buf, offset, + dELTA, lEN, tRAILB); t128_0 <- (zeroextu128 t64); - (dELTA, lEN, tRAILB, r4) <@ nEW_KECCAK__A32____aread_subu256 (buf, - offset, dELTA, lEN, tRAILB); - (dELTA, lEN, tRAILB, t64) <@ nEW_KECCAK__A32____aread_subu64 (buf, - offset, dELTA, lEN, tRAILB); + (dELTA, lEN, tRAILB, r4) <@ a32____aread_subu256 (buf, offset, + dELTA, lEN, tRAILB); + (dELTA, lEN, tRAILB, t64) <@ a32____aread_subu64 (buf, offset, + dELTA, lEN, tRAILB); t128_1 <- (VPINSR_2u64 t128_1 t64 (W8.of_int 1)); - (dELTA, lEN, tRAILB, r5) <@ nEW_KECCAK__A32____aread_subu256 (buf, - offset, dELTA, lEN, tRAILB); - (dELTA, lEN, tRAILB, t64) <@ nEW_KECCAK__A32____aread_subu64 (buf, - offset, dELTA, lEN, tRAILB); + (dELTA, lEN, tRAILB, r5) <@ a32____aread_subu256 (buf, offset, + dELTA, lEN, tRAILB); + (dELTA, lEN, tRAILB, t64) <@ a32____aread_subu64 (buf, offset, + dELTA, lEN, tRAILB); t128_0 <- (VPINSR_2u64 t128_0 t64 (W8.of_int 1)); r2 <- (W256.of_int (((W128.to_uint t128_0) %% (2 ^ 128)) + ((2 ^ 128) * (W128.to_uint t128_1)))); st.[2] <- (st.[2] `^` r2); - (dELTA, lEN, tRAILB, r6) <@ nEW_KECCAK__A32____aread_subu256 (buf, - offset, dELTA, lEN, tRAILB); - st <@ nEW_KECCAK____addstate_r3456 (st, r3, r4, r5, r6); + (dELTA, lEN, tRAILB, r6) <@ a32____aread_subu256 (buf, offset, + dELTA, lEN, tRAILB); + st <@ __addstate_r3456 (st, r3, r4, r5, r6); } else { } offset <- (offset + (W64.of_int dELTA)); return (st, offset); } - proc nEW_KECCAK__A32____absorb_array_avx2 (st:W256.t Array7.t, - buf:W8.t Array32.t, - offset:W64.t, lEN:int, - rATE8:int, tRAILB:int) : + proc a32____absorb_array_avx2 (st:W256.t Array7.t, buf:W8.t Array32.t, + offset:W64.t, lEN:int, rATE8:int, tRAILB:int) : W256.t Array7.t * W64.t = { var aLL:int; var iTERS:int; @@ -5077,29 +4911,27 @@ module M(SC:Syscall_t) = { if ((0 < iTERS)) { i <- (W64.of_int 0); while ((i \ult (W64.of_int iTERS))) { - (st, offset) <@ nEW_KECCAK__A32____addstate_array_avx2 (st, buf, - offset, rATE8, 0); - st <@ nEW_KECCAK___keccakf1600_avx2 (st); + (st, offset) <@ a32____addstate_array_avx2 (st, buf, offset, + rATE8, 0); + st <@ _keccakf1600_avx2 (st); i <- (i + (W64.of_int 1)); } } else { } lEN <- (lEN %% rATE8); - (st, offset) <@ nEW_KECCAK__A32____addstate_array_avx2 (st, buf, - offset, lEN, tRAILB); + (st, offset) <@ a32____addstate_array_avx2 (st, buf, offset, lEN, + tRAILB); if ((tRAILB <> 0)) { - st <@ nEW_KECCAK____addratebit_avx2 (st, rATE8); + st <@ __addratebit_avx2 (st, rATE8); } else { } return (st, offset); } - proc nEW_KECCAK__A32____pstate_array_avx2 (pst:W64.t Array25.t, aT:int, - buf:W8.t Array32.t, - offset:W64.t, lEN:int, - tRAILB:int) : W64.t Array25.t * - int * W64.t = { + proc a32____pstate_array_avx2 (pst:W64.t Array25.t, aT:int, + buf:W8.t Array32.t, offset:W64.t, lEN:int, + tRAILB:int) : W64.t Array25.t * int * W64.t = { var aLL:int; var dELTA:int; var lO:int; @@ -5122,8 +4954,8 @@ module M(SC:Syscall_t) = { } else { } - (dELTA, _2, tRAILB, t64) <@ nEW_KECCAK__A32____aread_subu64 ( - buf, offset, dELTA, lEN, tRAILB); + (dELTA, _2, tRAILB, t64) <@ a32____aread_subu64 (buf, offset, + dELTA, lEN, tRAILB); t64 <- (t64 `<<` (W8.of_int (8 * lO))); pst.[(W64.to_uint at)] <- (pst.[(W64.to_uint at)] `^` t64); lO <- 0; @@ -5136,8 +4968,8 @@ module M(SC:Syscall_t) = { (W64.to_uint (offset + (W64.of_int dELTA)))); dELTA <- (dELTA + (8 - lO)); } else { - (dELTA, _0, _1, t64) <@ nEW_KECCAK__A32____aread_subu64 ( - buf, offset, dELTA, (8 - lO), 0); + (dELTA, _0, _1, t64) <@ a32____aread_subu64 (buf, offset, + dELTA, (8 - lO), 0); } lEN <- (lEN - (8 - lO)); aT <- (aT + (8 - lO)); @@ -5204,8 +5036,8 @@ module M(SC:Syscall_t) = { } else { } - (dELTA, _3, tRAILB, t64) <@ nEW_KECCAK__A32____aread_subu64 (buf, - offset, dELTA, lO, tRAILB); + (dELTA, _3, tRAILB, t64) <@ a32____aread_subu64 (buf, offset, + dELTA, lO, tRAILB); pst.[(aLL %/ 8)] <- t64; } else { @@ -5213,20 +5045,19 @@ module M(SC:Syscall_t) = { offset <- (offset + (W64.of_int dELTA)); return (pst, aLL, offset); } - proc nEW_KECCAK__A32____pabsorb_array_avx2 (pst:W64.t Array25.t, aT:int, - st:W256.t Array7.t, - buf:W8.t Array32.t, - offset:W64.t, lEN:int, - rATE8:int, tRAILB:int) : - W64.t Array25.t * int * W256.t Array7.t * W64.t = { + proc a32____pabsorb_array_avx2 (pst:W64.t Array25.t, aT:int, + st:W256.t Array7.t, buf:W8.t Array32.t, + offset:W64.t, lEN:int, rATE8:int, + tRAILB:int) : W64.t Array25.t * int * + W256.t Array7.t * W64.t = { var aLL:int; var iTERS:int; var i:W64.t; var _0:int; aLL <- (aT + lEN); if (((aT + lEN) < rATE8)) { - (pst, aT, offset) <@ nEW_KECCAK__A32____pstate_array_avx2 (pst, - aT, buf, offset, lEN, tRAILB); + (pst, aT, offset) <@ a32____pstate_array_avx2 (pst, aT, buf, offset, + lEN, tRAILB); if ((tRAILB <> 0)) { i <- (W64.of_int ((aT %/ 8) + 1)); if ((aT <= (5 * 8))) { @@ -5234,8 +5065,8 @@ module M(SC:Syscall_t) = { pst.[(W64.to_uint i)] <- (W64.of_int 0); i <- (i + (W64.of_int 1)); } - st <@ nEW_KECCAK____addpst01 (st, pst); - st <@ nEW_KECCAK____addratebit_avx2 (st, rATE8); + st <@ __addpst01 (st, pst); + st <@ __addratebit_avx2 (st, rATE8); } else { while ((i \ult (W64.of_int (rATE8 %/ 8)))) { pst.[(W64.to_uint i)] <- (W64.of_int 0); @@ -5248,18 +5079,18 @@ module M(SC:Syscall_t) = { (rATE8 - 1) ((get8 (WArray200.init64 (fun i_0 => pst.[i_0])) (rATE8 - 1)) `^` (W8.of_int 128))))); - st <@ nEW_KECCAK___addpstate_avx2 (st, pst); + st <@ _addpstate_avx2 (st, pst); } } else { } } else { if ((aT <> 0)) { - (pst, _0, offset) <@ nEW_KECCAK__A32____pstate_array_avx2 (pst, - aT, buf, offset, (rATE8 - aT), 0); + (pst, _0, offset) <@ a32____pstate_array_avx2 (pst, aT, buf, + offset, (rATE8 - aT), 0); lEN <- (lEN - (rATE8 - aT)); - st <@ nEW_KECCAK___addpstate_avx2 (st, pst); - st <@ nEW_KECCAK___keccakf1600_avx2 (st); + st <@ _addpstate_avx2 (st, pst); + st <@ _keccakf1600_avx2 (st); aT <- 0; } else { @@ -5267,20 +5098,20 @@ module M(SC:Syscall_t) = { iTERS <- (lEN %/ rATE8); i <- (W64.of_int 0); while ((i \ult (W64.of_int iTERS))) { - (st, offset) <@ nEW_KECCAK__A32____addstate_array_avx2 (st, buf, - offset, rATE8, 0); - st <@ nEW_KECCAK___keccakf1600_avx2 (st); + (st, offset) <@ a32____addstate_array_avx2 (st, buf, offset, + rATE8, 0); + st <@ _keccakf1600_avx2 (st); i <- (i + (W64.of_int 1)); } lEN <- (aLL %% rATE8); if ((tRAILB <> 0)) { - (st, offset) <@ nEW_KECCAK__A32____addstate_array_avx2 (st, buf, - offset, lEN, tRAILB); - st <@ nEW_KECCAK____addratebit_avx2 (st, rATE8); + (st, offset) <@ a32____addstate_array_avx2 (st, buf, offset, + lEN, tRAILB); + st <@ __addratebit_avx2 (st, rATE8); } else { if ((lEN <> 0)) { - (pst, aT, offset) <@ nEW_KECCAK__A32____pstate_array_avx2 ( - pst, 0, buf, offset, lEN, tRAILB); + (pst, aT, offset) <@ a32____pstate_array_avx2 (pst, 0, buf, + offset, lEN, tRAILB); } else { } @@ -5288,10 +5119,9 @@ module M(SC:Syscall_t) = { } return (pst, aT, st, offset); } - proc nEW_KECCAK__A32____dumpstate_array_avx2 (buf:W8.t Array32.t, - offset:W64.t, lEN:int, - st:W256.t Array7.t) : - W8.t Array32.t * W64.t = { + proc a32____dumpstate_array_avx2 (buf:W8.t Array32.t, offset:W64.t, + lEN:int, st:W256.t Array7.t) : W8.t Array32.t * + W64.t = { var dELTA:int; var t128_0:W128.t; var t128_1:W128.t; @@ -5304,21 +5134,20 @@ module M(SC:Syscall_t) = { var _0:int; dELTA <- 0; if ((8 <= lEN)) { - (buf, dELTA, _0) <@ nEW_KECCAK__A32____awrite_subu256 (buf, offset, - dELTA, 8, st.[0]); + (buf, dELTA, _0) <@ a32____awrite_subu256 (buf, offset, dELTA, 8, + st.[0]); lEN <- (lEN - 8); } else { - (buf, dELTA, lEN) <@ nEW_KECCAK__A32____awrite_subu256 (buf, offset, - dELTA, lEN, st.[0]); + (buf, dELTA, lEN) <@ a32____awrite_subu256 (buf, offset, dELTA, + lEN, st.[0]); } - (buf, dELTA, lEN) <@ nEW_KECCAK__A32____awrite_subu256 (buf, offset, - dELTA, lEN, st.[1]); + (buf, dELTA, lEN) <@ a32____awrite_subu256 (buf, offset, dELTA, lEN, + st.[1]); if ((0 < lEN)) { t128_0 <- (truncateu128 st.[2]); t128_1 <- (VEXTRACTI128 st.[2] (W8.of_int 1)); t <- (truncateu64 t128_1); - (buf, dELTA, lEN) <@ nEW_KECCAK__A32____awrite_subu64 (buf, offset, - dELTA, lEN, t); + (buf, dELTA, lEN) <@ a32____awrite_subu64 (buf, offset, dELTA, lEN, t); t128_1 <- (VPUNPCKH_2u64 t128_1 t128_1); } else { @@ -5399,12 +5228,12 @@ module M(SC:Syscall_t) = { ((2 ^ 1) * ((0 %% (2 ^ 1)) + ((2 ^ 1) * ((1 %% (2 ^ 1)) + ((2 ^ 1) * 1)))))))))))))) )); - (buf, dELTA, lEN) <@ nEW_KECCAK__A32____awrite_subu256 (buf, offset, - dELTA, lEN, t256_4); + (buf, dELTA, lEN) <@ a32____awrite_subu256 (buf, offset, dELTA, + lEN, t256_4); if ((0 < lEN)) { t <- (truncateu64 t128_0); - (buf, dELTA, lEN) <@ nEW_KECCAK__A32____awrite_subu64 (buf, offset, - dELTA, lEN, t); + (buf, dELTA, lEN) <@ a32____awrite_subu64 (buf, offset, dELTA, + lEN, t); t128_0 <- (VPUNPCKH_2u64 t128_0 t128_0); } else { @@ -5425,15 +5254,15 @@ module M(SC:Syscall_t) = { ((2 ^ 1) * ((0 %% (2 ^ 1)) + ((2 ^ 1) * ((1 %% (2 ^ 1)) + ((2 ^ 1) * 1)))))))))))))) )); - (buf, dELTA, lEN) <@ nEW_KECCAK__A32____awrite_subu256 (buf, - offset, dELTA, lEN, t256_4); + (buf, dELTA, lEN) <@ a32____awrite_subu256 (buf, offset, dELTA, + lEN, t256_4); } else { } if ((0 < lEN)) { t <- (truncateu64 t128_1); - (buf, dELTA, lEN) <@ nEW_KECCAK__A32____awrite_subu64 (buf, offset, - dELTA, lEN, t); + (buf, dELTA, lEN) <@ a32____awrite_subu64 (buf, offset, dELTA, + lEN, t); } else { } @@ -5453,15 +5282,15 @@ module M(SC:Syscall_t) = { ((2 ^ 1) * ((0 %% (2 ^ 1)) + ((2 ^ 1) * ((1 %% (2 ^ 1)) + ((2 ^ 1) * 1)))))))))))))) )); - (buf, dELTA, lEN) <@ nEW_KECCAK__A32____awrite_subu256 (buf, - offset, dELTA, lEN, t256_4); + (buf, dELTA, lEN) <@ a32____awrite_subu256 (buf, offset, dELTA, + lEN, t256_4); } else { } if ((0 < lEN)) { t <- (truncateu64 t128_0); - (buf, dELTA, lEN) <@ nEW_KECCAK__A32____awrite_subu64 (buf, offset, - dELTA, lEN, t); + (buf, dELTA, lEN) <@ a32____awrite_subu64 (buf, offset, dELTA, + lEN, t); } else { } @@ -5481,8 +5310,8 @@ module M(SC:Syscall_t) = { ((2 ^ 1) * ((0 %% (2 ^ 1)) + ((2 ^ 1) * ((1 %% (2 ^ 1)) + ((2 ^ 1) * 1)))))))))))))) )); - (buf, dELTA, lEN) <@ nEW_KECCAK__A32____awrite_subu256 (buf, - offset, dELTA, lEN, t256_4); + (buf, dELTA, lEN) <@ a32____awrite_subu256 (buf, offset, dELTA, + lEN, t256_4); } else { } @@ -5492,10 +5321,9 @@ module M(SC:Syscall_t) = { offset <- (offset + (W64.of_int dELTA)); return (buf, offset); } - proc nEW_KECCAK__A32____squeeze_array_avx2 (buf:W8.t Array32.t, - offset:W64.t, lEN:int, - st:W256.t Array7.t, rATE8:int) : - W8.t Array32.t * W256.t Array7.t = { + proc a32____squeeze_array_avx2 (buf:W8.t Array32.t, offset:W64.t, lEN:int, + st:W256.t Array7.t, rATE8:int) : W8.t Array32.t * + W256.t Array7.t = { var iTERS:int; var lO:int; var i:W64.t; @@ -5505,18 +5333,17 @@ module M(SC:Syscall_t) = { if ((0 < iTERS)) { i <- (W64.of_int 0); while ((i \ult (W64.of_int iTERS))) { - st <@ nEW_KECCAK___keccakf1600_avx2 (st); - (buf, offset) <@ nEW_KECCAK__A32____dumpstate_array_avx2 (buf, - offset, rATE8, st); + st <@ _keccakf1600_avx2 (st); + (buf, offset) <@ a32____dumpstate_array_avx2 (buf, offset, + rATE8, st); i <- (i + (W64.of_int 1)); } } else { } if ((0 < lO)) { - st <@ nEW_KECCAK___keccakf1600_avx2 (st); - (buf, offset) <@ nEW_KECCAK__A32____dumpstate_array_avx2 (buf, - offset, lO, st); + st <@ _keccakf1600_avx2 (st); + (buf, offset) <@ a32____dumpstate_array_avx2 (buf, offset, lO, st); } else { } @@ -5525,12 +5352,10 @@ module M(SC:Syscall_t) = { } return (buf, st); } - proc nEW_KECCAK__A32____addstate_bcast_array_avx2x4 (st:W256.t Array25.t, - aT:int, - buf:W8.t Array32.t, - offset:W64.t, lEN:int, - tRAILB:int) : - W256.t Array25.t * int * W64.t = { + proc a32____addstate_bcast_array_avx2x4 (st:W256.t Array25.t, aT:int, + buf:W8.t Array32.t, offset:W64.t, + lEN:int, tRAILB:int) : W256.t Array25.t * + int * W64.t = { var aLL:int; var lO:int; var at:W64.t; @@ -5551,8 +5376,8 @@ module M(SC:Syscall_t) = { } else { } - (dELTA, _2, tRAILB, t256) <@ nEW_KECCAK__A32____aread_bcast_4subu64 ( - buf, offset, dELTA, lEN, tRAILB); + (dELTA, _2, tRAILB, t256) <@ a32____aread_bcast_4subu64 (buf, + offset, dELTA, lEN, tRAILB); t256 <- (VPSLL_4u64 t256 (W128.of_int (8 * lO))); t256 <- (t256 `^` @@ -5574,8 +5399,8 @@ module M(SC:Syscall_t) = { (W64.to_uint (offset + (W64.of_int dELTA))))); dELTA <- (dELTA + (8 - lO)); } else { - (dELTA, _0, _1, t256) <@ nEW_KECCAK__A32____aread_bcast_4subu64 ( - buf, offset, dELTA, (8 - lO), 0); + (dELTA, _0, _1, t256) <@ a32____aread_bcast_4subu64 (buf, + offset, dELTA, (8 - lO), 0); } lEN <- (lEN - (8 - lO)); aT <- (aT + (8 - lO)); @@ -5625,8 +5450,8 @@ module M(SC:Syscall_t) = { } else { } - (dELTA, _3, tRAILB, t256) <@ nEW_KECCAK__A32____aread_bcast_4subu64 ( - buf, offset, dELTA, lO, tRAILB); + (dELTA, _3, tRAILB, t256) <@ a32____aread_bcast_4subu64 (buf, + offset, dELTA, lO, tRAILB); offset <- (offset + (W64.of_int dELTA)); t256 <- (t256 `^` @@ -5641,11 +5466,9 @@ module M(SC:Syscall_t) = { } return (st, aLL, offset); } - proc nEW_KECCAK__A32____absorb_bcast_array_avx2x4 (st:W256.t Array25.t, - aT:int, - buf:W8.t Array32.t, - offset:W64.t, lEN:int, - rATE8:int, tRAILB:int) : + proc a32____absorb_bcast_array_avx2x4 (st:W256.t Array25.t, aT:int, + buf:W8.t Array32.t, offset:W64.t, + lEN:int, rATE8:int, tRAILB:int) : W256.t Array25.t * int * W64.t = { var aLL:int; var iTERS:int; @@ -5654,19 +5477,19 @@ module M(SC:Syscall_t) = { var _1:int; aLL <- (aT + lEN); if (((aT + lEN) < rATE8)) { - (st, aT, offset) <@ nEW_KECCAK__A32____addstate_bcast_array_avx2x4 ( - st, aT, buf, offset, lEN, tRAILB); + (st, aT, offset) <@ a32____addstate_bcast_array_avx2x4 (st, aT, + buf, offset, lEN, tRAILB); if ((tRAILB <> 0)) { - st <@ nEW_KECCAK____addratebit_avx2x4 (st, rATE8); + st <@ __addratebit_avx2x4 (st, rATE8); } else { } } else { if ((aT <> 0)) { - (st, _0, offset) <@ nEW_KECCAK__A32____addstate_bcast_array_avx2x4 ( - st, aT, buf, offset, (rATE8 - aT), 0); + (st, _0, offset) <@ a32____addstate_bcast_array_avx2x4 (st, + aT, buf, offset, (rATE8 - aT), 0); lEN <- (lEN - (rATE8 - aT)); - st <@ nEW_KECCAK___keccakf1600_avx2x4 (st); + st <@ _keccakf1600_avx2x4 (st); aT <- 0; } else { @@ -5674,25 +5497,24 @@ module M(SC:Syscall_t) = { iTERS <- (lEN %/ rATE8); i <- (W64.of_int 0); while ((i \ult (W64.of_int iTERS))) { - (st, _1, offset) <@ nEW_KECCAK__A32____addstate_bcast_array_avx2x4 ( - st, 0, buf, offset, rATE8, 0); - st <@ nEW_KECCAK___keccakf1600_avx2x4 (st); + (st, _1, offset) <@ a32____addstate_bcast_array_avx2x4 (st, 0, + buf, offset, rATE8, 0); + st <@ _keccakf1600_avx2x4 (st); i <- (i + (W64.of_int 1)); } lEN <- (aLL %% rATE8); - (st, aT, offset) <@ nEW_KECCAK__A32____addstate_bcast_array_avx2x4 ( - st, 0, buf, offset, lEN, tRAILB); + (st, aT, offset) <@ a32____addstate_bcast_array_avx2x4 (st, 0, + buf, offset, lEN, tRAILB); if ((tRAILB <> 0)) { - st <@ nEW_KECCAK____addratebit_avx2x4 (st, rATE8); + st <@ __addratebit_avx2x4 (st, rATE8); } else { } } return (st, aT, offset); } - proc nEW_KECCAK__A64____aread_subu64 (buf:W8.t Array64.t, offset:W64.t, - dELTA:int, lEN:int, tRAIL:int) : - int * int * int * W64.t = { + proc a64____aread_subu64 (buf:W8.t Array64.t, offset:W64.t, dELTA:int, + lEN:int, tRAIL:int) : int * int * int * W64.t = { var w:W64.t; var iLEN:int; var t16:W64.t; @@ -5757,9 +5579,8 @@ module M(SC:Syscall_t) = { } return (dELTA, lEN, tRAIL, w); } - proc nEW_KECCAK__A64____aread_subu128 (buf:W8.t Array64.t, offset:W64.t, - dELTA:int, lEN:int, tRAIL:int) : - int * int * int * W128.t = { + proc a64____aread_subu128 (buf:W8.t Array64.t, offset:W64.t, dELTA:int, + lEN:int, tRAIL:int) : int * int * int * W128.t = { var w:W128.t; var t64:W64.t; if (((lEN <= 0) /\ (tRAIL = 0))) { @@ -5779,21 +5600,20 @@ module M(SC:Syscall_t) = { (W64.to_uint (offset + (W64.of_int dELTA))))); dELTA <- (dELTA + 8); lEN <- (lEN - 8); - (dELTA, lEN, tRAIL, t64) <@ nEW_KECCAK__A64____aread_subu64 ( - buf, offset, dELTA, lEN, tRAIL); + (dELTA, lEN, tRAIL, t64) <@ a64____aread_subu64 (buf, offset, + dELTA, lEN, tRAIL); w <- (VPINSR_2u64 w t64 (W8.of_int 1)); } else { - (dELTA, lEN, tRAIL, t64) <@ nEW_KECCAK__A64____aread_subu64 ( - buf, offset, dELTA, lEN, tRAIL); + (dELTA, lEN, tRAIL, t64) <@ a64____aread_subu64 (buf, offset, + dELTA, lEN, tRAIL); w <- (zeroextu128 t64); } } } return (dELTA, lEN, tRAIL, w); } - proc nEW_KECCAK__A64____aread_subu256 (buf:W8.t Array64.t, offset:W64.t, - dELTA:int, lEN:int, tRAIL:int) : - int * int * int * W256.t = { + proc a64____aread_subu256 (buf:W8.t Array64.t, offset:W64.t, dELTA:int, + lEN:int, tRAIL:int) : int * int * int * W256.t = { var w:W256.t; var t128_1:W128.t; var t128_0:W128.t; @@ -5813,16 +5633,16 @@ module M(SC:Syscall_t) = { (W64.to_uint (offset + (W64.of_int dELTA)))); dELTA <- (dELTA + 16); lEN <- (lEN - 16); - (dELTA, lEN, tRAIL, t128_1) <@ nEW_KECCAK__A64____aread_subu128 ( - buf, offset, dELTA, lEN, tRAIL); + (dELTA, lEN, tRAIL, t128_1) <@ a64____aread_subu128 (buf, offset, + dELTA, lEN, tRAIL); w <- (W256.of_int (((W128.to_uint t128_0) %% (2 ^ 128)) + ((2 ^ 128) * (W128.to_uint t128_1)))); } else { t128_1 <- (set0_128); - (dELTA, lEN, tRAIL, t128_0) <@ nEW_KECCAK__A64____aread_subu128 ( - buf, offset, dELTA, lEN, tRAIL); + (dELTA, lEN, tRAIL, t128_0) <@ a64____aread_subu128 (buf, offset, + dELTA, lEN, tRAIL); w <- (W256.of_int (((W128.to_uint t128_0) %% (2 ^ 128)) + @@ -5832,9 +5652,8 @@ module M(SC:Syscall_t) = { } return (dELTA, lEN, tRAIL, w); } - proc nEW_KECCAK__A64____awrite_subu64 (buf:W8.t Array64.t, offset:W64.t, - dELTA:int, lEN:int, w:W64.t) : - W8.t Array64.t * int * int = { + proc a64____awrite_subu64 (buf:W8.t Array64.t, offset:W64.t, dELTA:int, + lEN:int, w:W64.t) : W8.t Array64.t * int * int = { if ((0 < lEN)) { if ((8 <= lEN)) { @@ -5887,9 +5706,8 @@ module M(SC:Syscall_t) = { } return (buf, dELTA, lEN); } - proc nEW_KECCAK__A64____awrite_subu128 (buf:W8.t Array64.t, offset:W64.t, - dELTA:int, lEN:int, w:W128.t) : - W8.t Array64.t * int * int = { + proc a64____awrite_subu128 (buf:W8.t Array64.t, offset:W64.t, dELTA:int, + lEN:int, w:W128.t) : W8.t Array64.t * int * int = { var t64:W64.t; if ((0 < lEN)) { if ((16 <= lEN)) { @@ -5915,17 +5733,16 @@ module M(SC:Syscall_t) = { } t64 <- (truncateu64 w); - (buf, dELTA, lEN) <@ nEW_KECCAK__A64____awrite_subu64 (buf, offset, - dELTA, lEN, t64); + (buf, dELTA, lEN) <@ a64____awrite_subu64 (buf, offset, dELTA, + lEN, t64); } } else { } return (buf, dELTA, lEN); } - proc nEW_KECCAK__A64____awrite_subu256 (buf:W8.t Array64.t, offset:W64.t, - dELTA:int, lEN:int, w:W256.t) : - W8.t Array64.t * int * int = { + proc a64____awrite_subu256 (buf:W8.t Array64.t, offset:W64.t, dELTA:int, + lEN:int, w:W256.t) : W8.t Array64.t * int * int = { var t128:W128.t; if ((0 < lEN)) { if ((32 <= lEN)) { @@ -5949,19 +5766,17 @@ module M(SC:Syscall_t) = { } else { t128 <- (truncateu128 w); } - (buf, dELTA, lEN) <@ nEW_KECCAK__A64____awrite_subu128 (buf, - offset, dELTA, lEN, t128); + (buf, dELTA, lEN) <@ a64____awrite_subu128 (buf, offset, dELTA, + lEN, t128); } } else { } return (buf, dELTA, lEN); } - proc nEW_KECCAK__A64____addstate_array_avx2 (st:W256.t Array7.t, - buf:W8.t Array64.t, - offset:W64.t, lEN:int, - tRAILB:int) : W256.t Array7.t * - W64.t = { + proc a64____addstate_array_avx2 (st:W256.t Array7.t, buf:W8.t Array64.t, + offset:W64.t, lEN:int, tRAILB:int) : + W256.t Array7.t * W64.t = { var dELTA:int; var t64:W64.t; var t128_0:W128.t; @@ -5974,51 +5789,49 @@ module M(SC:Syscall_t) = { var r2:W256.t; var r6:W256.t; dELTA <- 0; - (dELTA, lEN, tRAILB, t64) <@ nEW_KECCAK__A64____aread_subu64 (buf, - offset, dELTA, lEN, tRAILB); + (dELTA, lEN, tRAILB, t64) <@ a64____aread_subu64 (buf, offset, dELTA, + lEN, tRAILB); t128_0 <- (zeroextu128 t64); r0 <- (VPBROADCAST_4u64 (truncateu64 t128_0)); st.[0] <- (st.[0] `^` r0); - (dELTA, lEN, tRAILB, r1) <@ nEW_KECCAK__A64____aread_subu256 (buf, - offset, dELTA, lEN, tRAILB); + (dELTA, lEN, tRAILB, r1) <@ a64____aread_subu256 (buf, offset, dELTA, + lEN, tRAILB); st.[1] <- (st.[1] `^` r1); if ((0 < lEN)) { - (dELTA, lEN, tRAILB, t64) <@ nEW_KECCAK__A64____aread_subu64 (buf, - offset, dELTA, lEN, tRAILB); + (dELTA, lEN, tRAILB, t64) <@ a64____aread_subu64 (buf, offset, + dELTA, lEN, tRAILB); t128_1 <- (zeroextu128 t64); - (dELTA, lEN, tRAILB, r3) <@ nEW_KECCAK__A64____aread_subu256 (buf, - offset, dELTA, lEN, tRAILB); - (dELTA, lEN, tRAILB, t64) <@ nEW_KECCAK__A64____aread_subu64 (buf, - offset, dELTA, lEN, tRAILB); + (dELTA, lEN, tRAILB, r3) <@ a64____aread_subu256 (buf, offset, + dELTA, lEN, tRAILB); + (dELTA, lEN, tRAILB, t64) <@ a64____aread_subu64 (buf, offset, + dELTA, lEN, tRAILB); t128_0 <- (zeroextu128 t64); - (dELTA, lEN, tRAILB, r4) <@ nEW_KECCAK__A64____aread_subu256 (buf, - offset, dELTA, lEN, tRAILB); - (dELTA, lEN, tRAILB, t64) <@ nEW_KECCAK__A64____aread_subu64 (buf, - offset, dELTA, lEN, tRAILB); + (dELTA, lEN, tRAILB, r4) <@ a64____aread_subu256 (buf, offset, + dELTA, lEN, tRAILB); + (dELTA, lEN, tRAILB, t64) <@ a64____aread_subu64 (buf, offset, + dELTA, lEN, tRAILB); t128_1 <- (VPINSR_2u64 t128_1 t64 (W8.of_int 1)); - (dELTA, lEN, tRAILB, r5) <@ nEW_KECCAK__A64____aread_subu256 (buf, - offset, dELTA, lEN, tRAILB); - (dELTA, lEN, tRAILB, t64) <@ nEW_KECCAK__A64____aread_subu64 (buf, - offset, dELTA, lEN, tRAILB); + (dELTA, lEN, tRAILB, r5) <@ a64____aread_subu256 (buf, offset, + dELTA, lEN, tRAILB); + (dELTA, lEN, tRAILB, t64) <@ a64____aread_subu64 (buf, offset, + dELTA, lEN, tRAILB); t128_0 <- (VPINSR_2u64 t128_0 t64 (W8.of_int 1)); r2 <- (W256.of_int (((W128.to_uint t128_0) %% (2 ^ 128)) + ((2 ^ 128) * (W128.to_uint t128_1)))); st.[2] <- (st.[2] `^` r2); - (dELTA, lEN, tRAILB, r6) <@ nEW_KECCAK__A64____aread_subu256 (buf, - offset, dELTA, lEN, tRAILB); - st <@ nEW_KECCAK____addstate_r3456 (st, r3, r4, r5, r6); + (dELTA, lEN, tRAILB, r6) <@ a64____aread_subu256 (buf, offset, + dELTA, lEN, tRAILB); + st <@ __addstate_r3456 (st, r3, r4, r5, r6); } else { } offset <- (offset + (W64.of_int dELTA)); return (st, offset); } - proc nEW_KECCAK__A64____absorb_array_avx2 (st:W256.t Array7.t, - buf:W8.t Array64.t, - offset:W64.t, lEN:int, - rATE8:int, tRAILB:int) : + proc a64____absorb_array_avx2 (st:W256.t Array7.t, buf:W8.t Array64.t, + offset:W64.t, lEN:int, rATE8:int, tRAILB:int) : W256.t Array7.t * W64.t = { var aLL:int; var iTERS:int; @@ -6028,28 +5841,27 @@ module M(SC:Syscall_t) = { if ((0 < iTERS)) { i <- (W64.of_int 0); while ((i \ult (W64.of_int iTERS))) { - (st, offset) <@ nEW_KECCAK__A64____addstate_array_avx2 (st, buf, - offset, rATE8, 0); - st <@ nEW_KECCAK___keccakf1600_avx2 (st); + (st, offset) <@ a64____addstate_array_avx2 (st, buf, offset, + rATE8, 0); + st <@ _keccakf1600_avx2 (st); i <- (i + (W64.of_int 1)); } } else { } lEN <- (lEN %% rATE8); - (st, offset) <@ nEW_KECCAK__A64____addstate_array_avx2 (st, buf, - offset, lEN, tRAILB); + (st, offset) <@ a64____addstate_array_avx2 (st, buf, offset, lEN, + tRAILB); if ((tRAILB <> 0)) { - st <@ nEW_KECCAK____addratebit_avx2 (st, rATE8); + st <@ __addratebit_avx2 (st, rATE8); } else { } return (st, offset); } - proc nEW_KECCAK__A64____dumpstate_array_avx2 (buf:W8.t Array64.t, - offset:W64.t, lEN:int, - st:W256.t Array7.t) : - W8.t Array64.t * W64.t = { + proc a64____dumpstate_array_avx2 (buf:W8.t Array64.t, offset:W64.t, + lEN:int, st:W256.t Array7.t) : W8.t Array64.t * + W64.t = { var dELTA:int; var t128_0:W128.t; var t128_1:W128.t; @@ -6062,21 +5874,20 @@ module M(SC:Syscall_t) = { var _0:int; dELTA <- 0; if ((8 <= lEN)) { - (buf, dELTA, _0) <@ nEW_KECCAK__A64____awrite_subu256 (buf, offset, - dELTA, 8, st.[0]); + (buf, dELTA, _0) <@ a64____awrite_subu256 (buf, offset, dELTA, 8, + st.[0]); lEN <- (lEN - 8); } else { - (buf, dELTA, lEN) <@ nEW_KECCAK__A64____awrite_subu256 (buf, offset, - dELTA, lEN, st.[0]); + (buf, dELTA, lEN) <@ a64____awrite_subu256 (buf, offset, dELTA, + lEN, st.[0]); } - (buf, dELTA, lEN) <@ nEW_KECCAK__A64____awrite_subu256 (buf, offset, - dELTA, lEN, st.[1]); + (buf, dELTA, lEN) <@ a64____awrite_subu256 (buf, offset, dELTA, lEN, + st.[1]); if ((0 < lEN)) { t128_0 <- (truncateu128 st.[2]); t128_1 <- (VEXTRACTI128 st.[2] (W8.of_int 1)); t <- (truncateu64 t128_1); - (buf, dELTA, lEN) <@ nEW_KECCAK__A64____awrite_subu64 (buf, offset, - dELTA, lEN, t); + (buf, dELTA, lEN) <@ a64____awrite_subu64 (buf, offset, dELTA, lEN, t); t128_1 <- (VPUNPCKH_2u64 t128_1 t128_1); } else { @@ -6157,12 +5968,12 @@ module M(SC:Syscall_t) = { ((2 ^ 1) * ((0 %% (2 ^ 1)) + ((2 ^ 1) * ((1 %% (2 ^ 1)) + ((2 ^ 1) * 1)))))))))))))) )); - (buf, dELTA, lEN) <@ nEW_KECCAK__A64____awrite_subu256 (buf, offset, - dELTA, lEN, t256_4); + (buf, dELTA, lEN) <@ a64____awrite_subu256 (buf, offset, dELTA, + lEN, t256_4); if ((0 < lEN)) { t <- (truncateu64 t128_0); - (buf, dELTA, lEN) <@ nEW_KECCAK__A64____awrite_subu64 (buf, offset, - dELTA, lEN, t); + (buf, dELTA, lEN) <@ a64____awrite_subu64 (buf, offset, dELTA, + lEN, t); t128_0 <- (VPUNPCKH_2u64 t128_0 t128_0); } else { @@ -6183,15 +5994,15 @@ module M(SC:Syscall_t) = { ((2 ^ 1) * ((0 %% (2 ^ 1)) + ((2 ^ 1) * ((1 %% (2 ^ 1)) + ((2 ^ 1) * 1)))))))))))))) )); - (buf, dELTA, lEN) <@ nEW_KECCAK__A64____awrite_subu256 (buf, - offset, dELTA, lEN, t256_4); + (buf, dELTA, lEN) <@ a64____awrite_subu256 (buf, offset, dELTA, + lEN, t256_4); } else { } if ((0 < lEN)) { t <- (truncateu64 t128_1); - (buf, dELTA, lEN) <@ nEW_KECCAK__A64____awrite_subu64 (buf, offset, - dELTA, lEN, t); + (buf, dELTA, lEN) <@ a64____awrite_subu64 (buf, offset, dELTA, + lEN, t); } else { } @@ -6211,15 +6022,15 @@ module M(SC:Syscall_t) = { ((2 ^ 1) * ((0 %% (2 ^ 1)) + ((2 ^ 1) * ((1 %% (2 ^ 1)) + ((2 ^ 1) * 1)))))))))))))) )); - (buf, dELTA, lEN) <@ nEW_KECCAK__A64____awrite_subu256 (buf, - offset, dELTA, lEN, t256_4); + (buf, dELTA, lEN) <@ a64____awrite_subu256 (buf, offset, dELTA, + lEN, t256_4); } else { } if ((0 < lEN)) { t <- (truncateu64 t128_0); - (buf, dELTA, lEN) <@ nEW_KECCAK__A64____awrite_subu64 (buf, offset, - dELTA, lEN, t); + (buf, dELTA, lEN) <@ a64____awrite_subu64 (buf, offset, dELTA, + lEN, t); } else { } @@ -6239,8 +6050,8 @@ module M(SC:Syscall_t) = { ((2 ^ 1) * ((0 %% (2 ^ 1)) + ((2 ^ 1) * ((1 %% (2 ^ 1)) + ((2 ^ 1) * 1)))))))))))))) )); - (buf, dELTA, lEN) <@ nEW_KECCAK__A64____awrite_subu256 (buf, - offset, dELTA, lEN, t256_4); + (buf, dELTA, lEN) <@ a64____awrite_subu256 (buf, offset, dELTA, + lEN, t256_4); } else { } @@ -6250,10 +6061,9 @@ module M(SC:Syscall_t) = { offset <- (offset + (W64.of_int dELTA)); return (buf, offset); } - proc nEW_KECCAK__A64____squeeze_array_avx2 (buf:W8.t Array64.t, - offset:W64.t, lEN:int, - st:W256.t Array7.t, rATE8:int) : - W8.t Array64.t * W256.t Array7.t = { + proc a64____squeeze_array_avx2 (buf:W8.t Array64.t, offset:W64.t, lEN:int, + st:W256.t Array7.t, rATE8:int) : W8.t Array64.t * + W256.t Array7.t = { var iTERS:int; var lO:int; var i:W64.t; @@ -6263,18 +6073,17 @@ module M(SC:Syscall_t) = { if ((0 < iTERS)) { i <- (W64.of_int 0); while ((i \ult (W64.of_int iTERS))) { - st <@ nEW_KECCAK___keccakf1600_avx2 (st); - (buf, offset) <@ nEW_KECCAK__A64____dumpstate_array_avx2 (buf, - offset, rATE8, st); + st <@ _keccakf1600_avx2 (st); + (buf, offset) <@ a64____dumpstate_array_avx2 (buf, offset, + rATE8, st); i <- (i + (W64.of_int 1)); } } else { } if ((0 < lO)) { - st <@ nEW_KECCAK___keccakf1600_avx2 (st); - (buf, offset) <@ nEW_KECCAK__A64____dumpstate_array_avx2 (buf, - offset, lO, st); + st <@ _keccakf1600_avx2 (st); + (buf, offset) <@ a64____dumpstate_array_avx2 (buf, offset, lO, st); } else { } @@ -6283,9 +6092,8 @@ module M(SC:Syscall_t) = { } return (buf, st); } - proc nEW_KECCAK__A128____awrite_subu64 (buf:W8.t Array128.t, offset:W64.t, - dELTA:int, lEN:int, w:W64.t) : - W8.t Array128.t * int * int = { + proc a128____awrite_subu64 (buf:W8.t Array128.t, offset:W64.t, dELTA:int, + lEN:int, w:W64.t) : W8.t Array128.t * int * int = { if ((0 < lEN)) { if ((8 <= lEN)) { @@ -6338,12 +6146,11 @@ module M(SC:Syscall_t) = { } return (buf, dELTA, lEN); } - proc nEW_KECCAK__A128____dumpstate_array_avx2x4 (buf0:W8.t Array128.t, - buf1:W8.t Array128.t, - buf2:W8.t Array128.t, - buf3:W8.t Array128.t, - offset:W64.t, lEN:int, - st:W256.t Array25.t) : + proc a128____dumpstate_array_avx2x4 (buf0:W8.t Array128.t, + buf1:W8.t Array128.t, + buf2:W8.t Array128.t, + buf3:W8.t Array128.t, offset:W64.t, + lEN:int, st:W256.t Array25.t) : W8.t Array128.t * W8.t Array128.t * W8.t Array128.t * W8.t Array128.t * W64.t = { var i:W64.t; @@ -6378,7 +6185,7 @@ module M(SC:Syscall_t) = { (get256_direct (WArray800.init256 (fun i_0 => st.[i_0])) (W64.to_uint (((W64.of_int 4) * i) + (W64.of_int (3 * 32))))); i <- (i + (W64.of_int 32)); - (x0, x1, x2, x3) <@ nEW_KECCAK____4u64x4_u256x4 (x0, x1, x2, x3); + (x0, x1, x2, x3) <@ __4u64x4_u256x4 (x0, x1, x2, x3); buf0 <- (Array128.init (WArray128.get8 @@ -6441,41 +6248,36 @@ module M(SC:Syscall_t) = { t0 <- (get64_direct (WArray800.init256 (fun i_0 => st.[i_0])) (W64.to_uint (((W64.of_int 4) * i) + (W64.of_int (0 * 8))))); - (buf0, _0, _1) <@ nEW_KECCAK__A128____awrite_subu64 (buf0, offset, 0, - (lEN %% 8), t0); + (buf0, _0, _1) <@ a128____awrite_subu64 (buf0, offset, 0, (lEN %% 8), + t0); t1 <- (get64_direct (WArray800.init256 (fun i_0 => st.[i_0])) (W64.to_uint (((W64.of_int 4) * i) + (W64.of_int (1 * 8))))); - (buf1, _2, _3) <@ nEW_KECCAK__A128____awrite_subu64 (buf1, offset, 0, - (lEN %% 8), t1); + (buf1, _2, _3) <@ a128____awrite_subu64 (buf1, offset, 0, (lEN %% 8), + t1); t2 <- (get64_direct (WArray800.init256 (fun i_0 => st.[i_0])) (W64.to_uint (((W64.of_int 4) * i) + (W64.of_int (2 * 8))))); - (buf2, _4, _5) <@ nEW_KECCAK__A128____awrite_subu64 (buf2, offset, 0, - (lEN %% 8), t2); + (buf2, _4, _5) <@ a128____awrite_subu64 (buf2, offset, 0, (lEN %% 8), + t2); t3 <- (get64_direct (WArray800.init256 (fun i_0 => st.[i_0])) (W64.to_uint (((W64.of_int 4) * i) + (W64.of_int (3 * 8))))); - (buf3, _6, _7) <@ nEW_KECCAK__A128____awrite_subu64 (buf3, offset, 0, - (lEN %% 8), t3); + (buf3, _6, _7) <@ a128____awrite_subu64 (buf3, offset, 0, (lEN %% 8), + t3); offset <- (offset + (W64.of_int (lEN %% 8))); } else { } return (buf0, buf1, buf2, buf3, offset); } - proc nEW_KECCAK__A128____squeeze_array_avx2x4 (buf0:W8.t Array128.t, - buf1:W8.t Array128.t, - buf2:W8.t Array128.t, - buf3:W8.t Array128.t, - offset:W64.t, lEN:int, - st:W256.t Array25.t, - rATE8:int) : W8.t Array128.t * - W8.t Array128.t * - W8.t Array128.t * - W8.t Array128.t * - W64.t * - W256.t Array25.t = { + proc a128____squeeze_array_avx2x4 (buf0:W8.t Array128.t, + buf1:W8.t Array128.t, + buf2:W8.t Array128.t, + buf3:W8.t Array128.t, offset:W64.t, + lEN:int, st:W256.t Array25.t, rATE8:int) : + W8.t Array128.t * W8.t Array128.t * W8.t Array128.t * W8.t Array128.t * + W64.t * W256.t Array25.t = { var iTERS:int; var lO:int; var i:W64.t; @@ -6485,8 +6287,8 @@ module M(SC:Syscall_t) = { if ((0 < iTERS)) { i <- (W64.of_int 0); while ((i \ult (W64.of_int iTERS))) { - st <@ nEW_KECCAK___keccakf1600_avx2x4 (st); - (buf0, buf1, buf2, buf3, offset) <@ nEW_KECCAK__A128____dumpstate_array_avx2x4 ( + st <@ _keccakf1600_avx2x4 (st); + (buf0, buf1, buf2, buf3, offset) <@ a128____dumpstate_array_avx2x4 ( buf0, buf1, buf2, buf3, offset, rATE8, st); i <- (i + (W64.of_int 1)); } @@ -6494,8 +6296,8 @@ module M(SC:Syscall_t) = { } if ((0 < lO)) { - st <@ nEW_KECCAK___keccakf1600_avx2x4 (st); - (buf0, buf1, buf2, buf3, offset) <@ nEW_KECCAK__A128____dumpstate_array_avx2x4 ( + st <@ _keccakf1600_avx2x4 (st); + (buf0, buf1, buf2, buf3, offset) <@ a128____dumpstate_array_avx2x4 ( buf0, buf1, buf2, buf3, offset, lO, st); } else { @@ -6505,10 +6307,9 @@ module M(SC:Syscall_t) = { } return (buf0, buf1, buf2, buf3, offset, st); } - proc nEW_KECCAK__ABUFLEN____awrite_subu64 (buf:W8.t Array536.t, - offset:W64.t, dELTA:int, - lEN:int, w:W64.t) : W8.t Array536.t * - int * int = { + proc aBUFLEN____awrite_subu64 (buf:W8.t Array536.t, offset:W64.t, + dELTA:int, lEN:int, w:W64.t) : W8.t Array536.t * + int * int = { if ((0 < lEN)) { if ((8 <= lEN)) { @@ -6561,10 +6362,9 @@ module M(SC:Syscall_t) = { } return (buf, dELTA, lEN); } - proc nEW_KECCAK__ABUFLEN____awrite_subu128 (buf:W8.t Array536.t, - offset:W64.t, dELTA:int, - lEN:int, w:W128.t) : W8.t Array536.t * - int * int = { + proc aBUFLEN____awrite_subu128 (buf:W8.t Array536.t, offset:W64.t, + dELTA:int, lEN:int, w:W128.t) : W8.t Array536.t * + int * int = { var t64:W64.t; if ((0 < lEN)) { if ((16 <= lEN)) { @@ -6590,18 +6390,17 @@ module M(SC:Syscall_t) = { } t64 <- (truncateu64 w); - (buf, dELTA, lEN) <@ nEW_KECCAK__ABUFLEN____awrite_subu64 (buf, - offset, dELTA, lEN, t64); + (buf, dELTA, lEN) <@ aBUFLEN____awrite_subu64 (buf, offset, dELTA, + lEN, t64); } } else { } return (buf, dELTA, lEN); } - proc nEW_KECCAK__ABUFLEN____awrite_subu256 (buf:W8.t Array536.t, - offset:W64.t, dELTA:int, - lEN:int, w:W256.t) : W8.t Array536.t * - int * int = { + proc aBUFLEN____awrite_subu256 (buf:W8.t Array536.t, offset:W64.t, + dELTA:int, lEN:int, w:W256.t) : W8.t Array536.t * + int * int = { var t128:W128.t; if ((0 < lEN)) { if ((32 <= lEN)) { @@ -6625,17 +6424,16 @@ module M(SC:Syscall_t) = { } else { t128 <- (truncateu128 w); } - (buf, dELTA, lEN) <@ nEW_KECCAK__ABUFLEN____awrite_subu128 (buf, - offset, dELTA, lEN, t128); + (buf, dELTA, lEN) <@ aBUFLEN____awrite_subu128 (buf, offset, + dELTA, lEN, t128); } } else { } return (buf, dELTA, lEN); } - proc nEW_KECCAK__ABUFLEN____dumpstate_array_avx2 (buf:W8.t Array536.t, - offset:W64.t, lEN:int, - st:W256.t Array7.t) : + proc aBUFLEN____dumpstate_array_avx2 (buf:W8.t Array536.t, offset:W64.t, + lEN:int, st:W256.t Array7.t) : W8.t Array536.t * W64.t = { var dELTA:int; var t128_0:W128.t; @@ -6649,21 +6447,21 @@ module M(SC:Syscall_t) = { var _0:int; dELTA <- 0; if ((8 <= lEN)) { - (buf, dELTA, _0) <@ nEW_KECCAK__ABUFLEN____awrite_subu256 (buf, - offset, dELTA, 8, st.[0]); + (buf, dELTA, _0) <@ aBUFLEN____awrite_subu256 (buf, offset, dELTA, 8, + st.[0]); lEN <- (lEN - 8); } else { - (buf, dELTA, lEN) <@ nEW_KECCAK__ABUFLEN____awrite_subu256 (buf, - offset, dELTA, lEN, st.[0]); + (buf, dELTA, lEN) <@ aBUFLEN____awrite_subu256 (buf, offset, dELTA, + lEN, st.[0]); } - (buf, dELTA, lEN) <@ nEW_KECCAK__ABUFLEN____awrite_subu256 (buf, - offset, dELTA, lEN, st.[1]); + (buf, dELTA, lEN) <@ aBUFLEN____awrite_subu256 (buf, offset, dELTA, + lEN, st.[1]); if ((0 < lEN)) { t128_0 <- (truncateu128 st.[2]); t128_1 <- (VEXTRACTI128 st.[2] (W8.of_int 1)); t <- (truncateu64 t128_1); - (buf, dELTA, lEN) <@ nEW_KECCAK__ABUFLEN____awrite_subu64 (buf, - offset, dELTA, lEN, t); + (buf, dELTA, lEN) <@ aBUFLEN____awrite_subu64 (buf, offset, dELTA, + lEN, t); t128_1 <- (VPUNPCKH_2u64 t128_1 t128_1); } else { @@ -6744,12 +6542,12 @@ module M(SC:Syscall_t) = { ((2 ^ 1) * ((0 %% (2 ^ 1)) + ((2 ^ 1) * ((1 %% (2 ^ 1)) + ((2 ^ 1) * 1)))))))))))))) )); - (buf, dELTA, lEN) <@ nEW_KECCAK__ABUFLEN____awrite_subu256 (buf, - offset, dELTA, lEN, t256_4); + (buf, dELTA, lEN) <@ aBUFLEN____awrite_subu256 (buf, offset, dELTA, + lEN, t256_4); if ((0 < lEN)) { t <- (truncateu64 t128_0); - (buf, dELTA, lEN) <@ nEW_KECCAK__ABUFLEN____awrite_subu64 (buf, - offset, dELTA, lEN, t); + (buf, dELTA, lEN) <@ aBUFLEN____awrite_subu64 (buf, offset, dELTA, + lEN, t); t128_0 <- (VPUNPCKH_2u64 t128_0 t128_0); } else { @@ -6770,15 +6568,15 @@ module M(SC:Syscall_t) = { ((2 ^ 1) * ((0 %% (2 ^ 1)) + ((2 ^ 1) * ((1 %% (2 ^ 1)) + ((2 ^ 1) * 1)))))))))))))) )); - (buf, dELTA, lEN) <@ nEW_KECCAK__ABUFLEN____awrite_subu256 (buf, - offset, dELTA, lEN, t256_4); + (buf, dELTA, lEN) <@ aBUFLEN____awrite_subu256 (buf, offset, + dELTA, lEN, t256_4); } else { } if ((0 < lEN)) { t <- (truncateu64 t128_1); - (buf, dELTA, lEN) <@ nEW_KECCAK__ABUFLEN____awrite_subu64 (buf, - offset, dELTA, lEN, t); + (buf, dELTA, lEN) <@ aBUFLEN____awrite_subu64 (buf, offset, dELTA, + lEN, t); } else { } @@ -6798,15 +6596,15 @@ module M(SC:Syscall_t) = { ((2 ^ 1) * ((0 %% (2 ^ 1)) + ((2 ^ 1) * ((1 %% (2 ^ 1)) + ((2 ^ 1) * 1)))))))))))))) )); - (buf, dELTA, lEN) <@ nEW_KECCAK__ABUFLEN____awrite_subu256 (buf, - offset, dELTA, lEN, t256_4); + (buf, dELTA, lEN) <@ aBUFLEN____awrite_subu256 (buf, offset, + dELTA, lEN, t256_4); } else { } if ((0 < lEN)) { t <- (truncateu64 t128_0); - (buf, dELTA, lEN) <@ nEW_KECCAK__ABUFLEN____awrite_subu64 (buf, - offset, dELTA, lEN, t); + (buf, dELTA, lEN) <@ aBUFLEN____awrite_subu64 (buf, offset, dELTA, + lEN, t); } else { } @@ -6826,8 +6624,8 @@ module M(SC:Syscall_t) = { ((2 ^ 1) * ((0 %% (2 ^ 1)) + ((2 ^ 1) * ((1 %% (2 ^ 1)) + ((2 ^ 1) * 1)))))))))))))) )); - (buf, dELTA, lEN) <@ nEW_KECCAK__ABUFLEN____awrite_subu256 (buf, - offset, dELTA, lEN, t256_4); + (buf, dELTA, lEN) <@ aBUFLEN____awrite_subu256 (buf, offset, + dELTA, lEN, t256_4); } else { } @@ -6837,12 +6635,11 @@ module M(SC:Syscall_t) = { offset <- (offset + (W64.of_int dELTA)); return (buf, offset); } - proc nEW_KECCAK__ABUFLEN____dumpstate_array_avx2x4 (buf0:W8.t Array536.t, - buf1:W8.t Array536.t, - buf2:W8.t Array536.t, - buf3:W8.t Array536.t, - offset:W64.t, lEN:int, - st:W256.t Array25.t) : + proc aBUFLEN____dumpstate_array_avx2x4 (buf0:W8.t Array536.t, + buf1:W8.t Array536.t, + buf2:W8.t Array536.t, + buf3:W8.t Array536.t, offset:W64.t, + lEN:int, st:W256.t Array25.t) : W8.t Array536.t * W8.t Array536.t * W8.t Array536.t * W8.t Array536.t * W64.t = { var i:W64.t; @@ -6877,7 +6674,7 @@ module M(SC:Syscall_t) = { (get256_direct (WArray800.init256 (fun i_0 => st.[i_0])) (W64.to_uint (((W64.of_int 4) * i) + (W64.of_int (3 * 32))))); i <- (i + (W64.of_int 32)); - (x0, x1, x2, x3) <@ nEW_KECCAK____4u64x4_u256x4 (x0, x1, x2, x3); + (x0, x1, x2, x3) <@ __4u64x4_u256x4 (x0, x1, x2, x3); buf0 <- (Array536.init (WArray536.get8 @@ -6940,80 +6737,71 @@ module M(SC:Syscall_t) = { t0 <- (get64_direct (WArray800.init256 (fun i_0 => st.[i_0])) (W64.to_uint (((W64.of_int 4) * i) + (W64.of_int (0 * 8))))); - (buf0, _0, _1) <@ nEW_KECCAK__ABUFLEN____awrite_subu64 (buf0, - offset, 0, (lEN %% 8), t0); + (buf0, _0, _1) <@ aBUFLEN____awrite_subu64 (buf0, offset, 0, + (lEN %% 8), t0); t1 <- (get64_direct (WArray800.init256 (fun i_0 => st.[i_0])) (W64.to_uint (((W64.of_int 4) * i) + (W64.of_int (1 * 8))))); - (buf1, _2, _3) <@ nEW_KECCAK__ABUFLEN____awrite_subu64 (buf1, - offset, 0, (lEN %% 8), t1); + (buf1, _2, _3) <@ aBUFLEN____awrite_subu64 (buf1, offset, 0, + (lEN %% 8), t1); t2 <- (get64_direct (WArray800.init256 (fun i_0 => st.[i_0])) (W64.to_uint (((W64.of_int 4) * i) + (W64.of_int (2 * 8))))); - (buf2, _4, _5) <@ nEW_KECCAK__ABUFLEN____awrite_subu64 (buf2, - offset, 0, (lEN %% 8), t2); + (buf2, _4, _5) <@ aBUFLEN____awrite_subu64 (buf2, offset, 0, + (lEN %% 8), t2); t3 <- (get64_direct (WArray800.init256 (fun i_0 => st.[i_0])) (W64.to_uint (((W64.of_int 4) * i) + (W64.of_int (3 * 8))))); - (buf3, _6, _7) <@ nEW_KECCAK__ABUFLEN____awrite_subu64 (buf3, - offset, 0, (lEN %% 8), t3); + (buf3, _6, _7) <@ aBUFLEN____awrite_subu64 (buf3, offset, 0, + (lEN %% 8), t3); offset <- (offset + (W64.of_int (lEN %% 8))); } else { } return (buf0, buf1, buf2, buf3, offset); } - proc nEW_KECCAK___sha3_256A_M1184 (out:W8.t Array32.t, in_0:W64.t) : - W8.t Array32.t = { + proc _sha3_256A_M1184 (out:W8.t Array32.t, in_0:W64.t) : W8.t Array32.t = { var st:W256.t Array7.t; var offset:W64.t; var _0:W64.t; var _1:W256.t Array7.t; _1 <- witness; st <- witness; - st <@ nEW_KECCAK____state_init_avx2 (); - (st, _0) <@ nEW_KECCAK____absorb_imem_avx2 (st, in_0, 1184, 136, 6); + st <@ __state_init_avx2 (); + (st, _0) <@ __absorb_imem_avx2 (st, in_0, 1184, 136, 6); offset <- (W64.of_int 0); - (out, _1) <@ nEW_KECCAK__A32____squeeze_array_avx2 (out, offset, 32, - st, 136); + (out, _1) <@ a32____squeeze_array_avx2 (out, offset, 32, st, 136); return out; } - proc nEW_KECCAK___sha3_512A_A32 (out:W8.t Array64.t, in_0:W8.t Array32.t) : - W8.t Array64.t = { + proc _sha3_512A_A32 (out:W8.t Array64.t, in_0:W8.t Array32.t) : W8.t Array64.t = { var st:W256.t Array7.t; var offset:W64.t; var _0:W64.t; var _1:W256.t Array7.t; _1 <- witness; st <- witness; - st <@ nEW_KECCAK____state_init_avx2 (); + st <@ __state_init_avx2 (); offset <- (W64.of_int 0); - (st, _0) <@ nEW_KECCAK__A32____absorb_array_avx2 (st, in_0, offset, 32, - 72, 6); + (st, _0) <@ a32____absorb_array_avx2 (st, in_0, offset, 32, 72, 6); offset <- (W64.of_int 0); - (out, _1) <@ nEW_KECCAK__A64____squeeze_array_avx2 (out, offset, 64, - st, 72); + (out, _1) <@ a64____squeeze_array_avx2 (out, offset, 64, st, 72); return out; } - proc nEW_KECCAK___sha3_512A_A64 (out:W8.t Array64.t, in_0:W8.t Array64.t) : - W8.t Array64.t = { + proc _sha3_512A_A64 (out:W8.t Array64.t, in_0:W8.t Array64.t) : W8.t Array64.t = { var st:W256.t Array7.t; var offset:W64.t; var _0:W64.t; var _1:W256.t Array7.t; _1 <- witness; st <- witness; - st <@ nEW_KECCAK____state_init_avx2 (); + st <@ __state_init_avx2 (); offset <- (W64.of_int 0); - (st, _0) <@ nEW_KECCAK__A64____absorb_array_avx2 (st, in_0, offset, 64, - 72, 6); + (st, _0) <@ a64____absorb_array_avx2 (st, in_0, offset, 64, 72, 6); offset <- (W64.of_int 0); - (out, _1) <@ nEW_KECCAK__A64____squeeze_array_avx2 (out, offset, 64, - st, 72); + (out, _1) <@ a64____squeeze_array_avx2 (out, offset, 64, st, 72); return out; } - proc nEW_KECCAK___shake256_M32__M32_M1088 (out:W64.t, in0:W64.t, in1:W64.t) : - unit = { + proc _shake256_M32__M32_M1088 (out:W64.t, in0:W64.t, in1:W64.t) : unit = { var st:W256.t Array7.t; var pst_s:W64.t Array25.t; var pst:W64.t Array25.t; @@ -7027,22 +6815,18 @@ module M(SC:Syscall_t) = { pst <- witness; pst_s <- witness; st <- witness; - st <@ nEW_KECCAK____state_init_avx2 (); + st <@ __state_init_avx2 (); pst <- pst_s; - pst <@ nEW_KECCAK____pstate_init_avx2 (pst); - (pst, _0, st, _1) <@ nEW_KECCAK____pabsorb_imem_avx2 (pst, 0, st, - in0, 32, 136, 0); - (pst, _2, st, _3) <@ nEW_KECCAK____pabsorb_imem_avx2 (pst, 32, - st, in1, 1088, 136, 31); - ( _4, _5) <@ nEW_KECCAK____squeeze_imem_avx2 (out, 32, st, 136); + pst <@ __pstate_init_avx2 (pst); + (pst, _0, st, _1) <@ __pabsorb_imem_avx2 (pst, 0, st, in0, 32, 136, 0); + (pst, _2, st, _3) <@ __pabsorb_imem_avx2 (pst, 32, st, in1, 1088, 136, + 31); + ( _4, _5) <@ __squeeze_imem_avx2 (out, 32, st, 136); return (); } - proc nEW_KECCAK___shake256x4_A128__A32_A1 (out0:W8.t Array128.t, - out1:W8.t Array128.t, - out2:W8.t Array128.t, - out3:W8.t Array128.t, - seed:W8.t Array32.t, - nonces:W8.t Array4.t) : + proc _shake256x4_A128__A32_A1 (out0:W8.t Array128.t, out1:W8.t Array128.t, + out2:W8.t Array128.t, out3:W8.t Array128.t, + seed:W8.t Array32.t, nonces:W8.t Array4.t) : W8.t Array128.t * W8.t Array128.t * W8.t Array128.t * W8.t Array128.t = { var st_s:W256.t Array25.t; var st:W256.t Array25.t; @@ -7055,24 +6839,24 @@ module M(SC:Syscall_t) = { st <- witness; st_s <- witness; st <- st_s; - st <@ nEW_KECCAK____state_init_avx2x4 (st); + st <@ __state_init_avx2x4 (st); offset <- (W64.of_int 0); - (st, _0, _1) <@ nEW_KECCAK__A32____absorb_bcast_array_avx2x4 (st, 0, - seed, offset, 32, 136, 0); + (st, _0, _1) <@ a32____absorb_bcast_array_avx2x4 (st, 0, seed, + offset, 32, 136, 0); offset <- (W64.of_int 0); - (st, _2, _3) <@ nEW_KECCAK__A1____absorb_array_avx2x4 (st, 32, + (st, _2, _3) <@ a1____absorb_array_avx2x4 (st, 32, (Array1.init (fun i => nonces.[(0 + i)])), (Array1.init (fun i => nonces.[(1 + i)])), (Array1.init (fun i => nonces.[(2 + i)])), (Array1.init (fun i => nonces.[(3 + i)])), offset, 1, 136, 31); offset <- (W64.of_int 0); - (out0, out1, out2, out3, _4, st) <@ nEW_KECCAK__A128____squeeze_array_avx2x4 ( + (out0, out1, out2, out3, _4, st) <@ a128____squeeze_array_avx2x4 ( out0, out1, out2, out3, offset, 128, st, 136); st_s <- st; return (out0, out1, out2, out3); } - proc nEW_KECCAK___shake128_absorb_A32_A2 (seed:W8.t Array32.t, - pos:W8.t Array2.t) : W256.t Array7.t = { + proc _shake128_absorb_A32_A2 (seed:W8.t Array32.t, pos:W8.t Array2.t) : + W256.t Array7.t = { var st:W256.t Array7.t; var pst_s:W64.t Array25.t; var pst:W64.t Array25.t; @@ -7084,53 +6868,49 @@ module M(SC:Syscall_t) = { pst <- witness; pst_s <- witness; st <- witness; - st <@ nEW_KECCAK____state_init_avx2 (); + st <@ __state_init_avx2 (); pst <- pst_s; - pst <@ nEW_KECCAK____pstate_init_avx2 (pst); + pst <@ __pstate_init_avx2 (pst); offset <- (W64.of_int 0); - (pst, _0, st, _1) <@ nEW_KECCAK__A32____pabsorb_array_avx2 (pst, 0, - st, seed, offset, 32, 168, 0); + (pst, _0, st, _1) <@ a32____pabsorb_array_avx2 (pst, 0, st, seed, + offset, 32, 168, 0); offset <- (W64.of_int 0); - (pst, _2, st, _3) <@ nEW_KECCAK__A2____pabsorb_array_avx2 (pst, 32, - st, pos, offset, 2, 168, 31); + (pst, _2, st, _3) <@ a2____pabsorb_array_avx2 (pst, 32, st, pos, + offset, 2, 168, 31); return st; } - proc nEW_KECCAK___shake128x4_absorb_A32_A2 (st:W256.t Array25.t, - seed:W8.t Array32.t, - pos:W8.t Array8.t) : W256.t Array25.t = { + proc _shake128x4_absorb_A32_A2 (st:W256.t Array25.t, seed:W8.t Array32.t, + pos:W8.t Array8.t) : W256.t Array25.t = { var offset:W64.t; var aT:int; var _0:W64.t; var _1:int; var _2:W64.t; - st <@ nEW_KECCAK____state_init_avx2x4 (st); + st <@ __state_init_avx2x4 (st); offset <- (W64.of_int 0); - (st, aT, _0) <@ nEW_KECCAK__A32____absorb_bcast_array_avx2x4 (st, 0, - seed, offset, 32, 168, 0); + (st, aT, _0) <@ a32____absorb_bcast_array_avx2x4 (st, 0, seed, offset, + 32, 168, 0); offset <- (W64.of_int 0); - (st, _1, _2) <@ nEW_KECCAK__A2____absorb_array_avx2x4 (st, aT, + (st, _1, _2) <@ a2____absorb_array_avx2x4 (st, aT, (Array2.init (fun i => pos.[(0 + i)])), (Array2.init (fun i => pos.[(2 + i)])), (Array2.init (fun i => pos.[(4 + i)])), (Array2.init (fun i => pos.[(6 + i)])), offset, 2, 168, 31); return st; } - proc nEW_KECCAK___shake128_squeeze3blocks (buf:W8.t Array536.t, - st:W256.t Array7.t) : W8.t Array536.t = { + proc _shake128_squeeze3blocks (buf:W8.t Array536.t, st:W256.t Array7.t) : + W8.t Array536.t = { var offset:W64.t; - st <@ nEW_KECCAK___keccakf1600_avx2 (st); + st <@ _keccakf1600_avx2 (st); offset <- (W64.of_int 0); - (buf, offset) <@ nEW_KECCAK__ABUFLEN____dumpstate_array_avx2 (buf, - offset, 168, st); - st <@ nEW_KECCAK___keccakf1600_avx2 (st); - (buf, offset) <@ nEW_KECCAK__ABUFLEN____dumpstate_array_avx2 (buf, - offset, 168, st); - st <@ nEW_KECCAK___keccakf1600_avx2 (st); - (buf, offset) <@ nEW_KECCAK__ABUFLEN____dumpstate_array_avx2 (buf, - offset, 200, st); + (buf, offset) <@ aBUFLEN____dumpstate_array_avx2 (buf, offset, 168, st); + st <@ _keccakf1600_avx2 (st); + (buf, offset) <@ aBUFLEN____dumpstate_array_avx2 (buf, offset, 168, st); + st <@ _keccakf1600_avx2 (st); + (buf, offset) <@ aBUFLEN____dumpstate_array_avx2 (buf, offset, 200, st); return buf; } - proc nEW_KECCAK___shake128_next_state (buf:W8.t Array536.t) : W8.t Array536.t = { + proc _shake128_next_state (buf:W8.t Array536.t) : W8.t Array536.t = { var pst:W64.t Array25.t; var st:W256.t Array7.t; var offset:W64.t; @@ -7142,15 +6922,13 @@ module M(SC:Syscall_t) = { (fun i => (get64 (WArray536.init8 (fun i => buf.[i])) ((2 * (168 %/ 8)) + i))) ); - st <@ nEW_KECCAK____state_from_pstate_avx2 (pst); - st <@ nEW_KECCAK___keccakf1600_avx2 (st); + st <@ __state_from_pstate_avx2 (pst); + st <@ _keccakf1600_avx2 (st); offset <- (W64.of_int (2 * 168)); - (buf, _0) <@ nEW_KECCAK__ABUFLEN____dumpstate_array_avx2 (buf, offset, - 200, st); + (buf, _0) <@ aBUFLEN____dumpstate_array_avx2 (buf, offset, 200, st); return buf; } - proc nEW_KECCAK___shake128x4_squeeze3blocks (st:W256.t Array25.t, - buf:W8.t Array2144.t) : + proc _shake128x4_squeeze3blocks (st:W256.t Array25.t, buf:W8.t Array2144.t) : W256.t Array25.t * W8.t Array2144.t = { var buf0:W8.t Array536.t; var buf1:W8.t Array536.t; @@ -7166,14 +6944,14 @@ module M(SC:Syscall_t) = { buf2 <- (Array536.init (fun i => buf.[((2 * 536) + i)])); buf3 <- (Array536.init (fun i => buf.[((3 * 536) + i)])); offset <- (W64.of_int 0); - st <@ nEW_KECCAK___keccakf1600_avx2x4 (st); - (buf0, buf1, buf2, buf3, offset) <@ nEW_KECCAK__ABUFLEN____dumpstate_array_avx2x4 ( + st <@ _keccakf1600_avx2x4 (st); + (buf0, buf1, buf2, buf3, offset) <@ aBUFLEN____dumpstate_array_avx2x4 ( buf0, buf1, buf2, buf3, offset, 168, st); - st <@ nEW_KECCAK___keccakf1600_avx2x4 (st); - (buf0, buf1, buf2, buf3, offset) <@ nEW_KECCAK__ABUFLEN____dumpstate_array_avx2x4 ( + st <@ _keccakf1600_avx2x4 (st); + (buf0, buf1, buf2, buf3, offset) <@ aBUFLEN____dumpstate_array_avx2x4 ( buf0, buf1, buf2, buf3, offset, 168, st); - st <@ nEW_KECCAK___keccakf1600_avx2x4 (st); - (buf0, buf1, buf2, buf3, offset) <@ nEW_KECCAK__ABUFLEN____dumpstate_array_avx2x4 ( + st <@ _keccakf1600_avx2x4 (st); + (buf0, buf1, buf2, buf3, offset) <@ aBUFLEN____dumpstate_array_avx2x4 ( buf0, buf1, buf2, buf3, offset, 200, st); buf <- (Array2144.init @@ -7201,65 +6979,6 @@ module M(SC:Syscall_t) = { ); return (st, buf); } - proc _sha3_256A_M1184 (out:W8.t Array32.t, in_0:W64.t) : W8.t Array32.t = { - - out <@ nEW_KECCAK___sha3_256A_M1184 (out, in_0); - return out; - } - proc _shake256_M32__M32_M1088 (out:W64.t, in0:W64.t, in1:W64.t) : unit = { - - nEW_KECCAK___shake256_M32__M32_M1088 (out, in0, in1); - return (); - } - proc _shake256x4_A128__A32_A1 (out0:W8.t Array128.t, out1:W8.t Array128.t, - out2:W8.t Array128.t, out3:W8.t Array128.t, - seed:W8.t Array32.t, nonces:W8.t Array4.t) : - W8.t Array128.t * W8.t Array128.t * W8.t Array128.t * W8.t Array128.t = { - - (out0, out1, out2, out3) <@ nEW_KECCAK___shake256x4_A128__A32_A1 ( - out0, out1, out2, out3, seed, nonces); - return (out0, out1, out2, out3); - } - proc _sha3_512A_A64 (out:W8.t Array64.t, in_0:W8.t Array64.t) : W8.t Array64.t = { - - out <@ nEW_KECCAK___sha3_512A_A64 (out, in_0); - return out; - } - proc _sha3_512A_A32 (out:W8.t Array64.t, in_0:W8.t Array32.t) : W8.t Array64.t = { - - out <@ nEW_KECCAK___sha3_512A_A32 (out, in_0); - return out; - } - proc _shake128_absorb_A32_A2 (seed:W8.t Array32.t, pos:W8.t Array2.t) : - W256.t Array7.t = { - var st:W256.t Array7.t; - st <- witness; - st <@ nEW_KECCAK___shake128_absorb_A32_A2 (seed, pos); - return st; - } - proc _shake128x4_absorb_A32_A2 (st:W256.t Array25.t, seed:W8.t Array32.t, - pos:W8.t Array8.t) : W256.t Array25.t = { - - st <@ nEW_KECCAK___shake128x4_absorb_A32_A2 (st, seed, pos); - return st; - } - proc _shake128_squeeze3blocks (buf:W8.t Array536.t, st:W256.t Array7.t) : - W8.t Array536.t = { - - buf <@ nEW_KECCAK___shake128_squeeze3blocks (buf, st); - return buf; - } - proc _shake128_next_state (buf:W8.t Array536.t) : W8.t Array536.t = { - - buf <@ nEW_KECCAK___shake128_next_state (buf); - return buf; - } - proc _shake128x4_squeeze3blocks (st:W256.t Array25.t, buf:W8.t Array2144.t) : - W256.t Array25.t * W8.t Array2144.t = { - - (st, buf) <@ nEW_KECCAK___shake128x4_squeeze3blocks (st, buf); - return (st, buf); - } proc _poly_add2 (rp:W16.t Array256.t, bp:W16.t Array256.t) : W16.t Array256.t = { var aux:int; var i:int; diff --git a/code/jasmin/mlkem_avx2/gen_matrix.jinc b/code/jasmin/mlkem_avx2/gen_matrix.jinc index 637f1784..19be3db9 100644 --- a/code/jasmin/mlkem_avx2/gen_matrix.jinc +++ b/code/jasmin/mlkem_avx2/gen_matrix.jinc @@ -1,9 +1,4 @@ -/* // OLD INTERFACE -require "keccak/keccakf1600x4_avx2.jinc" -require "keccak/keccakf1600_avx2.jinc" -*/ -// NEW INTERFACE -require "mlkem_keccak_avx2_TRANSITION.jinc" +require "mlkem_keccak_avx2.jinc" require "params.jinc" diff --git a/code/jasmin/mlkem_avx2/keccak/keccak1600_array_avx2_ASIZE.jinc b/code/jasmin/mlkem_avx2/keccak/keccak1600_array_avx2_ASIZE.jinc deleted file mode 100644 index 9e3cfc5a..00000000 --- a/code/jasmin/mlkem_avx2/keccak/keccak1600_array_avx2_ASIZE.jinc +++ /dev/null @@ -1,384 +0,0 @@ -/* DEPENDENCIES -require "keccak1600_avx2.jinc" -param int ASIZE = 1002; -*/ - -require "subreadwrite_array_ASIZE.jinc" - -/* - ONE-SHOT (FIXED-SIZE) ARRAY ABSORB - ================================== -*/ - -inline fn __addstate_array_avx2 -( reg u256[7] st -, reg const ptr u8[ASIZE] buf -, reg u64 offset -, inline int LEN -, inline int TRAILB -) -> reg ptr u256[7] /* st */ - , reg u64 /* offset */ -{ - reg u64 t64; - reg u256 r0, r1, r2, r3, r4, r5, r6; - reg u128 t128_0, t128_1; - inline int DELTA; - DELTA = 0; - - DELTA, LEN, TRAILB, t64 = __aread_subu64(buf, offset, DELTA, LEN, TRAILB); - t128_0 = (128u) t64; - r0 = #VPBROADCAST_4u64(t128_0); - st[0] ^= r0; - - DELTA, LEN, TRAILB, r1 = __aread_subu256(buf, offset, DELTA, LEN, TRAILB); - st[1] ^= r1; - - if (0 < LEN ) { - DELTA, LEN, TRAILB, t64 = __aread_subu64(buf,offset, DELTA, LEN, TRAILB); - t128_1 = (128u) t64; - - DELTA, LEN, TRAILB, r3 = __aread_subu256(buf, offset, DELTA, LEN, TRAILB); - - DELTA, LEN, TRAILB, t64 = __aread_subu64(buf, offset, DELTA, LEN, TRAILB); - t128_0 = (128u) t64; - - DELTA, LEN, TRAILB, r4 = __aread_subu256(buf, offset, DELTA, LEN, TRAILB); - - DELTA, LEN, TRAILB, t64 = __aread_subu64(buf, offset, DELTA, LEN, TRAILB); - t128_1 = #VPINSR_2u64(t128_1, t64, 1); - - DELTA, LEN, TRAILB, r5 = __aread_subu256(buf, offset, DELTA, LEN, TRAILB); - - DELTA, LEN, TRAILB, t64 = __aread_subu64(buf, offset, DELTA, LEN, TRAILB); - t128_0 = #VPINSR_2u64(t128_0, t64, 1); - r2 = (2u128)[t128_1, t128_0]; - st[2] ^= r2; - - DELTA, LEN, TRAILB, r6 = __aread_subu256(buf, offset, DELTA, LEN, TRAILB); - - st = __addstate_r3456( st, r3, r4, r5, r6); - } - offset += DELTA; - return st, offset; -} - -inline fn __absorb_array_avx2 -( reg u256[7] st -, reg const ptr u8[ASIZE] buf -, reg u64 offset -, inline int LEN -, inline int RATE8 -, inline int TRAILB /* closes state if !=0 (i.e. adds trailbyte and padding) */ -) -> reg u256[7] /* st */ - , reg u64 /* offset */ -{ - reg u64 i; - inline int ALL, ITERS; - - ALL = LEN + (TRAILB!=0 ? 1 : 0); - - // continue by processing full blocks - ITERS = LEN / RATE8; // number of full blocks - if (0 < ITERS) { - i = 0; - while ( i < ITERS ) { - st, offset = __addstate_array_avx2(st, buf, offset, RATE8, 0); - st = _keccakf1600_avx2(st); - i += 1; - } - } - - // last incomplete block - LEN = LEN % RATE8; - st, offset = __addstate_array_avx2(st, buf, offset, LEN, TRAILB); - if (TRAILB!=0) { st = __addratebit_avx2(st, RATE8); } - - return st, offset; -} - -/* - INCREMENTAL (FIXED-SIZE) MEMORY ABSORB - ====================================== -*/ - -inline fn __pstate_array_avx2 -( reg mut ptr u64[25] pst -, inline int AT /* bytes (0 <= AT < 200) */ -, reg const ptr u8[ASIZE] buf -, reg u64 offset -, inline int LEN -, inline int TRAILB -) -> reg ptr u64[25] /* pst */ - , inline int /* AT */ - , reg u64 /* offset */ -{ - inline int DELTA, LO, ALL; - reg u64 at, t64; - reg u256 t256; - reg u128 t128; - - DELTA = 0; - ALL = AT+LEN; // total bytes to process (excluding trail byte, if !=0) - LO = AT % 8; // leftover bytes - at = AT / 8; // current pstate position - - if ( 0 < LO ) { // process first word... - if ( LO+LEN < 8) { // ...not enough to fill a word (just update it) - if ( TRAILB != 0 ) { ALL += 1; } - DELTA, _, TRAILB, t64 = __aread_subu64(buf, offset, DELTA, LEN, TRAILB); - t64 <<= 8*LO; - pst[(int) at] ^= t64; - LO = 0; - AT = 0; - LEN = 0; - } else { // process first word - if ( 8 <= LEN ) { - t64 = buf.[u64 offset + DELTA]; - DELTA += (8-LO); - } else { - DELTA, _, _, t64 = __aread_subu64(buf, offset, DELTA, 8-LO, 0); - } - LEN -= 8-LO; - AT += 8-LO; - t64 <<= 8*LO; - pst[(int) at] ^= t64; - at += 1; - } - } - - // continue processing remaining bytes - if (32 <= LEN) { - offset += DELTA; - DELTA = 0; - while ( at < AT/8+4*(LEN/32)) { - t256 = buf.[u256 offset]; - offset += 32; - pst.[u256 8*at] = t256; - at += 4; - } - LEN = LEN % 32; - } - if (16 <= LEN) { - t128 = buf.[u128 offset + DELTA]; - DELTA += 16; - pst.[u128 8*at] = t128; - at += 2; - LEN -= 16; - } - if (8 <= LEN) { - t64 = buf.[u64 offset + DELTA]; - DELTA += 8; - pst.[u64 8*at] = t64; - at += 1; - LEN -= 8; - } - - // process last word (possibly closing the state) - LO = (AT+LEN) % 8; - if ( 0 < LO || TRAILB != 0 ) { - if ( TRAILB != 0 ) { ALL += 1; } - DELTA, _, TRAILB, t64 = __aread_subu64(buf, offset, DELTA, LO, TRAILB); - pst[u64 (ALL/8)] = t64; - } - offset += DELTA; - return pst, ALL, offset; -} - -inline fn __pabsorb_array_avx2 -( reg mut ptr u64[25] pst -, inline int AT -, reg u256[7] st -, reg const ptr u8[ASIZE] buf -, reg u64 offset -, inline int LEN -, inline int RATE8 -, inline int TRAILB /* closes state if !=0 (i.e. adds trailbyte and padding) */ -) -> reg ptr u64[25] /* pst */ - , inline int /* AT */ - , reg u256[7] /* st */ - , reg u64 /* offset */ -{ - reg u64 i; - inline int ALL, ITERS; - - ALL = AT + LEN; - if ( (AT+LEN) < RATE8 ) { // not enough to fill a block! - pst, AT, offset = __pstate_array_avx2(pst, AT, buf, offset, LEN, TRAILB); - if (TRAILB != 0) { // add pstate and closes the state - i = AT/8 + 1; - if (AT <= 5*8) { // only st[0..1] is affected - while (i < 5) { pst[i] = 0; i += 1; } - st = __addpst01(st, pst); - st = __addratebit_avx2(st, RATE8); - } else { // all state is affected - while (i < RATE8/8) { pst[i] = 0; i += 1; } - pst[u8 RATE8-1] ^= 0x80; - st = _addpstate_avx2(st, pst); - } - } - } else { // at least a block is filled - if ( AT != 0 ) { // start by filling the first block - pst, _, offset = __pstate_array_avx2(pst, AT, buf, offset, RATE8-AT, 0); - LEN = LEN - (RATE8-AT); - st = _addpstate_avx2(st, pst); - st = _keccakf1600_avx2(st); - AT = 0; - } - - // continue by processing full blocks - ITERS = LEN / RATE8; // number of full blocks - i = 0; - while ( i < ITERS ) { - st, offset = __addstate_array_avx2(st, buf, offset, RATE8, 0); - st = _keccakf1600_avx2(st); - i += 1; - } - - // last incomplete block - LEN = ALL % RATE8; - if (TRAILB!=0) { - st, offset = __addstate_array_avx2(st, buf, offset, LEN, TRAILB); - st = __addratebit_avx2(st, RATE8); - } else if ( LEN != 0) { - pst, AT, offset = __pstate_array_avx2(pst, 0, buf, offset, LEN, TRAILB); -/* - if (TRAILB != 0) { // add pstate and closes the state - i = AT/8 + 1; - if (AT <= 5*8) { // only st[0..1] is affected - while (i < 5) { pst[i] = 0; i += 1; } - st = __addpst01(st, pst); - st = __addratebit_avx2(st, RATE8); - } else { // all state is affected - while (i < RATE8/8) { pst[i] = 0; i += 1; } - pst[u8 RATE8-1] ^= 0x80; - st = _addpstate_avx2(st, pst); - } - } -*/ - } - } - return pst, AT, st, offset; -} - -/* - ONE-SHOT (FIXED-SIZE) MEMORY SQUEEZE - ==================================== -*/ - -inline fn __dumpstate_array_avx2 -( reg mut ptr u8[ASIZE] buf -, reg u64 offset -, inline int LEN -, reg u256[7] st -) -> reg ptr u8[ASIZE] /* buf */ - , reg u64 /* offset */ -{ - reg u64 t; - reg u128 t128_0, t128_1; - reg u256 t256_0, t256_1, t256_2, t256_3, t256_4; - inline int DELTA; - - DELTA = 0; - - // reg0 - if (8 <= LEN) { - buf, DELTA, _ = __awrite_subu256(buf, offset, DELTA, 8, st[0]); - LEN -= 8; - } else { - buf, DELTA, LEN = __awrite_subu256(buf, offset, DELTA, LEN, st[0]); - } - - // reg1 - buf, DELTA, LEN = __awrite_subu256(buf, offset, DELTA, LEN, st[1]); - - // reg2 (5) - if (0 reg ptr u8[ASIZE] /* buf */ - , reg u256[7] /* st */ -{ - reg u64 i; - inline int ITERS, LO; - ITERS = LEN/RATE8; - LO = LEN%RATE8; - if (0 reg u256[7] -{ - inline int i; - reg u256[7] st; - - for i=0 to 7 { st[i] = #set0_256(); } - - return st; -} - -/* - PSTATE - UNPERMUTED KECCAK STATE - ================================ -*/ -inline fn __pstate_init_avx2 -( reg mut ptr u64[25] pst -) -> reg ptr u64[25] -{ - inline int i; - reg u64 z64; - reg u256 z256; - - z256 = #set0_256(); - for i=0 to 25/4 { pst[u256 i] = z256; } - z64 = 0; - pst[24] = z64; - - return pst; -} - -inline fn __perm_reg3456_avx2 -( reg u256 r3 r4 r5 r6 -) -> reg u256 /* st[3] */ - , reg u256 /* st[4] */ - , reg u256 /* st[5] */ - , reg u256 /* st[6] */ -{ - reg u256 t256_0, t256_1, t256_2; - reg u256 st3, st4, st5, st6; - // [ 16 7 8 19 ] - t256_0 = #VPBLEND_8u32(r3, r5, (8u1)[1,1,0,0,0,0,1,1]); - // [ 11 22 23 14 ] - t256_1 = #VPBLEND_8u32(r6, r4, (8u1)[1,1,0,0,0,0,1,1]); - // [ 6 12 13 9 ] - t256_2 = #VPBLEND_8u32(r4, r3, (8u1)[1,1,0,0,0,0,1,1]); - // [ 16 7 23 14 ] - st3 = #VPBLEND_8u32(t256_0, t256_1, (8u1)[1,1,1,1,0,0,0,0]); - // [ 11 22 8 19 ] - st4 = #VPBLEND_8u32(t256_1, t256_0, (8u1)[1,1,1,1,0,0,0,0]); - // [ 21 17 18 24 ] - t256_0 = #VPBLEND_8u32(r5, r6, (8u1)[1,1,0,0,0,0,1,1]); - // [ 21 17 13 9 ] - st5 = #VPBLEND_8u32(t256_0, t256_2, (8u1)[1,1,1,1,0,0,0,0]); - // [ 6 12 18 24 ] - st6 = #VPBLEND_8u32(t256_2, t256_0, (8u1)[1,1,1,1,0,0,0,0]); - - return st3, st4, st5, st6; -} - -inline fn __unperm_reg3456_avx2 -( reg u256 st3 st4 st5 st6 -) -> reg u256 /* r3 */ - , reg u256 /* r4 */ - , reg u256 /* r5 */ - , reg u256 /* r6 */ -{ - reg u256 t256_0, t256_1, t256_2, t256_3; - reg u256 r3, r4, r5, r6; - // [ 16, 7, 8, 19 ] - t256_0 = #VPBLEND_8u32(st3, st4, (8u1)[1,1,1,1,0,0,0,0]); - // [ 11, 22, 23, 14 ] - t256_1 = #VPBLEND_8u32(st4, st3, (8u1)[1,1,1,1,0,0,0,0]); - // [ 21, 17, 18, 24 ] - t256_2 = #VPBLEND_8u32(st5, st6, (8u1)[1,1,1,1,0,0,0,0]); - // [ 6, 12, 13, 9 ] - t256_3 = #VPBLEND_8u32(st6, st5, (8u1)[1,1,1,1,0,0,0,0]); - // [ 6, 7, 8, 9 ] - r3 = #VPBLEND_8u32(t256_0, t256_3, (8u1)[1,1,0,0,0,0,1,1]); - // [ 11, 12, 13, 14 ] - r4 = #VPBLEND_8u32(t256_3, t256_1, (8u1)[1,1,0,0,0,0,1,1]); - // [ 16, 17, 18, 19 ] - r5 = #VPBLEND_8u32(t256_2, t256_0, (8u1)[1,1,0,0,0,0,1,1]); - // [ 21, 22, 23, 24 ] - r6 = #VPBLEND_8u32(t256_1, t256_2, (8u1)[1,1,0,0,0,0,1,1]); - - return r3, r4, r5, r6; -} - -/* - STATE READ - ========== -*/ -inline fn __state_from_pstate_avx2 -( reg const ptr u64[25] pst -) -> reg u256[7] -{ - reg u256[7] st; - reg u128 t128_0, t128_1; - reg u64 t; - - st[0] = #VPBROADCAST_4u64(pst.[u64 0]); - st[1] = pst.[u256 8]; - - // [ 5 - ] - t128_0 = #VMOV(pst.[u64 5*8]); - // [ 6 7 8 9 ] - st[3] = pst.[u256 6*8]; - // [ 10 - ] - t128_1 = #VMOV(pst.[u64 10*8]); - // [ 11 12 13 14 ] - st[4] = pst.[u256 11*8]; - // [ 5 15 ] - t = pst.[u64 15*8]; - t128_0 = #VPINSR_2u64(t128_0, t, 1); - // [ 16 17 18 19 ] - st[5] = pst.[u256 16*8]; - // [ 10 20 ] - t = pst.[u64 20*8]; - t128_1 = #VPINSR_2u64(t128_1, t, 1); - // [ 10 20 5 15 ] - st[2] = (2u128)[t128_0, t128_1]; - // [ 21 22 23 24 ] - st[6] = pst.[u256 21*8]; - st[3], st[4], st[5], st[6] = __perm_reg3456_avx2(st[3], st[4], st[5], st[6]); - - return st; -} - - -inline fn __addstate_r3456 -( reg u256[7] st -, reg u256 r3 r4 r5 r6 -) -> reg u256[7] -{ - r3, r4, r5, r6 = __perm_reg3456_avx2(r3, r4, r5, r6); - st[3] ^= r3; - st[4] ^= r4; - st[5] ^= r5; - st[6] ^= r6; -/* - reg u256 t256_0, t256_1, t256_2; - // [ 16 7 8 19 ] - t256_0 = #VPBLEND_8u32(r3, r5, (8u1)[1,1,0,0,0,0,1,1]); - // [ 11 22 23 14 ] - t256_1 = #VPBLEND_8u32(r6, r4, (8u1)[1,1,0,0,0,0,1,1]); - // [ 6 12 13 9 ] - t256_2 = #VPBLEND_8u32(r4, r3, (8u1)[1,1,0,0,0,0,1,1]); - // [ 16 7 23 14 ] - r3 = #VPBLEND_8u32(t256_0, t256_1, (8u1)[1,1,1,1,0,0,0,0]); - st[3] ^= r3; - // [ 11 22 8 19 ] - r4 = #VPBLEND_8u32(t256_1, t256_0, (8u1)[1,1,1,1,0,0,0,0]); - st[4] ^= r4; - // [ 21 17 18 24 ] - t256_0 = #VPBLEND_8u32(r5, r6, (8u1)[1,1,0,0,0,0,1,1]); - // [ 21 17 13 9 ] - r5 = #VPBLEND_8u32(t256_0, t256_2, (8u1)[1,1,1,1,0,0,0,0]); - st[5] ^= r5; - // [ 6 12 18 24 ] - r6 = #VPBLEND_8u32(t256_2, t256_0, (8u1)[1,1,1,1,0,0,0,0]); - st[6] ^= r6; -*/ - return st; -} - -inline fn __addpst01 -( reg u256[7] st -, reg const ptr u64[25] pst -) -> reg u256[7] -{ - reg u256 t256; - t256 = #VPBROADCAST_4u64(pst.[u64 0]); - st[0] ^= t256; - t256 = pst.[u256 8*1]; - st[1] ^= t256; - return st; -} - -inline fn __addpst23456 // remaining entries -( reg u256[7] st -, reg const ptr u64[25] pst -) -> reg u256[7] -{ - reg u256 r2, r3, r4, r5, r6; - reg u128 t128_0, t128_1; - reg u64 t; - - // [ 5 - ] - t128_0 = #VMOV(pst.[u64 5*8]); - // [ 6 7 8 9 ] - r3 = pst.[u256 6*8]; - // [ 10 - ] - t128_1 = #VMOV(pst.[u64 10*8]); - // [ 11 12 13 14 ] - r4 = pst.[u256 11*8]; - // [ 5 15 ] - t = pst.[u64 15*8]; - t128_0 = #VPINSR_2u64(t128_0, t, 1); - // [ 16 17 18 19 ] - r5 = pst.[u256 16*8]; - // [ 10 20 ] - t = pst.[u64 20*8]; - t128_1 = #VPINSR_2u64(t128_1, t, 1); - // [ 10 20 5 15 ] - r2 = (2u128)[t128_0, t128_1]; - st[2] ^= r2; - // [ 21 22 23 24 ] - r6 = pst.[u256 21*8]; - - st = __addstate_r3456(st, r3, r4, r5, r6); - - return st; -} - -fn _addpstate_avx2 -( reg u256[7] st -, reg const ptr u64[25] pst -) -> reg u256[7] -{ - st = __addpst01(st, pst); - st = __addpst23456(st, pst); - return st; -} - -/* - ADD RATE BIT - ============ -*/ - -inline fn __stavx2_pos(inline int POS) -> inline int, inline int { - inline int R, L; - //0: [ 0 0 0 0 ] - R = 0; L = 0; - if (0 < POS) { - //1: [ 1 2 3 4 ] - if (POS <= 4) { R = 1; L = POS-1; } - //2: [ 10 20 5 15 ] - else if (POS == 10) { R = 2; L = 0; } - else if (POS == 20) { R = 2; L = 1; } - else if (POS == 5 ) { R = 2; L = 2; } - else if (POS == 15) { R = 2; L = 3; } - //3: [ 16 7 23 14 ] - else if (POS == 16) { R = 3; L = 0; } - else if (POS == 7 ) { R = 3; L = 1; } - else if (POS == 23) { R = 3; L = 2; } - else if (POS == 14) { R = 3; L = 3; } - //4: [ 11 22 8 19 ] - else if (POS == 11) { R = 4; L = 0; } - else if (POS == 22) { R = 4; L = 1; } - else if (POS == 8 ) { R = 4; L = 2; } - else if (POS == 19) { R = 4; L = 3; } - //5: [ 21 17 13 9 ] - else if (POS == 21) { R = 5; L = 0; } - else if (POS == 17) { R = 5; L = 1; } - else if (POS == 13) { R = 5; L = 2; } - else if (POS == 9 ) { R = 5; L = 3; } - //6: [ 6 12 18 24 ] - else if (POS == 6 ) { R = 6; L = 0; } - else if (POS == 12) { R = 6; L = 1; } - else if (POS == 18) { R = 6; L = 2; } - else if (POS == 24) { R = 6; L = 3; } - } - return R,L; -} - -inline fn __u64_to_u256 -( reg u64 x -, inline int L -) -> reg u256 -{ - reg u256 t256; - reg u128 t128; - - if (L % 2 == 0) { - t128 = (128u) x; - } else { - t128 = #set0_128(); - t128 = #VPINSR_2u64(t128, x, 1); - } - t256 = #set0_256(); - if (L / 2 == 0) { - t256 = #VINSERTI128(t256, t128, 0); - } else { - t256 = #VINSERTI128(t256, t128, 1); - } - - return t256; -} - -inline fn __addratebit_avx2 -( reg u256[7] st -, inline int RATE8 -) -> reg ptr u256[7] -{ - inline int R, L; - reg u256 t256; - - reg u64 t64; - t64 = 1; - t64 <<= (8*RATE8-1) % 64; // obs: should be 63 for all admissible rates! - R,L = __stavx2_pos((RATE8-1)/8); - if (R==0) { - t256 = #VPBROADCAST_4u64(t64); - } else { - t256 = __u64_to_u256(t64, L); - } - st[R] ^= t256; - return st; -} - diff --git a/code/jasmin/mlkem_avx2/keccak/keccak1600_globals.jinc b/code/jasmin/mlkem_avx2/keccak/keccak1600_globals.jinc deleted file mode 100644 index 290d1693..00000000 --- a/code/jasmin/mlkem_avx2/keccak/keccak1600_globals.jinc +++ /dev/null @@ -1,11 +0,0 @@ -param int R72 = 72; -param int R104 = 104; -param int R136 = 136; -param int R144 = 144; -param int R168 = 168; - -param int UNFINISHED = 0; -param int SHA3 = 0x06; -param int RAWSHAKE = 0x07; -param int SHAKE = 0x1F; - diff --git a/code/jasmin/mlkem_avx2/keccak/keccak1600_imem_avx2.jinc b/code/jasmin/mlkem_avx2/keccak/keccak1600_imem_avx2.jinc deleted file mode 100644 index e3e6ff99..00000000 --- a/code/jasmin/mlkem_avx2/keccak/keccak1600_imem_avx2.jinc +++ /dev/null @@ -1,349 +0,0 @@ -require "keccak1600_avx2.jinc" - -/* - ONE-SHOT (FIXED-SIZE) MEMORY ABSORB - =================================== -*/ - -inline fn __addstate_imem_avx2 -( reg u256[7] st -, reg u64 buf -, inline int LEN -, inline int TRAILB -) -> reg ptr u256[7] /* st */ - , reg u64 /* buf */ -{ - reg u64 t64; - reg u256 r0, r1, r2, r3, r4, r5, r6; - reg u128 t128_0, t128_1; - - buf, LEN, TRAILB, r0 = __mread_bcast_4subu64(buf, LEN, TRAILB); - st[0] ^= r0; - - buf, LEN, TRAILB, r1 = __mread_subu256(buf, LEN, TRAILB); - st[1] ^= r1; - - if (0 < LEN ) { - buf, LEN, TRAILB, t64 = __mread_subu64(buf, LEN, TRAILB); - t128_1 = (128u) t64; - - buf, LEN, TRAILB, r3 = __mread_subu256(buf, LEN, TRAILB); - - buf, LEN, TRAILB, t64 = __mread_subu64(buf, LEN, TRAILB); - t128_0 = (128u) t64; - - buf, LEN, TRAILB, r4 = __mread_subu256(buf, LEN, TRAILB); - - buf, LEN, TRAILB, t64 = __mread_subu64(buf, LEN, TRAILB); - t128_1 = #VPINSR_2u64(t128_1, t64, 1); - - buf, LEN, TRAILB, r5 = __mread_subu256(buf, LEN, TRAILB); - - buf, LEN, TRAILB, t64 = __mread_subu64(buf, LEN, TRAILB); - t128_0 = #VPINSR_2u64(t128_0, t64, 1); - r2 = (2u128)[t128_1, t128_0]; - st[2] ^= r2; - - buf, LEN, TRAILB, r6 = __mread_subu256(buf, LEN, TRAILB); - - st = __addstate_r3456( st, r3, r4, r5, r6); - } - return st, buf; -} - -inline fn __absorb_imem_avx2 -( reg u256[7] st -, reg u64 buf -, inline int LEN -, inline int RATE8 -, inline int TRAILB /* closes state if !=0 (i.e. adds trailbyte and padding) */ -) -> reg u256[7] /* st */ - , reg u64 /* buf */ -{ - reg u64 i; - inline int ALL, ITERS; - - ALL = LEN + (TRAILB!=0 ? 1 : 0); - - // continue by processing full blocks - ITERS = LEN / RATE8; // number of full blocks - if (0 < ITERS) { - i = 0; - while ( i < ITERS ) { - st, buf = __addstate_imem_avx2(st, buf, RATE8, 0); - st = _keccakf1600_avx2(st); - i += 1; - } - } - - // last incomplete block - LEN = LEN % RATE8; - st, buf = __addstate_imem_avx2(st, buf, LEN, TRAILB); - if (TRAILB!=0) { st = __addratebit_avx2(st, RATE8); } - - return st, buf; -} - -/* - INCREMENTAL (FIXED-SIZE) MEMORY ABSORB - ====================================== -*/ - -inline fn __pstate_imem_avx2 -( reg mut ptr u64[25] pst -, inline int AT /* bytes (0 <= AT < 200) */ -, reg u64 buf -, inline int LEN -, inline int TRAILB -) -> reg ptr u64[25] /* pst */ - , inline int /* AT */ - , reg u64 /* buf */ -{ - inline int LO, ALL; - reg u64 at, t64; - reg u256 t256; - reg u128 t128; - - ALL = AT+LEN; // total bytes to process (excluding trail byte, if !=0) - LO = AT % 8; // leftover bytes - at = AT / 8; // current pstate position - - if ( 0 < LO ) { // process first word... - if ( LO+LEN < 8) { // ...not enough to fill a word (just update it) - if ( TRAILB != 0 ) { ALL += 1; } - buf, _, TRAILB, t64 = __mread_subu64(buf, LEN, TRAILB); - t64 <<= 8*LO; - pst[(int) at] ^= t64; - LO = 0; - AT = 0; - LEN = 0; - } else { // process first word - if ( 8 <= LEN ) { - t64 = (u64)[buf]; - buf += (8-LO); - } else { - buf, _, _, t64 = __mread_subu64(buf, 8-LO, 0); - } - LEN -= 8-LO; - AT += 8-LO; - t64 <<= 8*LO; - pst[(int) at] ^= t64; - at += 1; - } - } - - // continue processing remaining bytes - if (32 <= LEN) { - while ( at < AT/8+4*(LEN/32)) { - t256 = (u256)[buf]; - buf += 32; - pst.[u256 8*at] = t256; - at += 4; - } - LEN = LEN % 32; - } - if (16 <= LEN) { - t128 = (u128)[buf]; - buf += 16; - pst.[u128 8*at] = t128; - at += 2; - LEN -= 16; - } - if (8 <= LEN) { - t64 = (u64)[buf]; - buf += 8; - pst.[u64 8*at] = t64; - at += 1; - LEN -= 8; - } - - // process last word (possibly closing the state) - LO = (AT+LEN) % 8; - if ( 0 < LEN || TRAILB != 0 ) { - if ( TRAILB != 0 ) { ALL += 1; } - buf, _, TRAILB, t64 = __mread_subu64(buf, LO, TRAILB); - pst[u64 (ALL/8)] = t64; - } - - return pst, ALL, buf; -} - -inline fn __pabsorb_imem_avx2 -( reg mut ptr u64[25] pst -, inline int AT -, reg u256[7] st -, reg u64 buf -, inline int LEN -, inline int RATE8 -, inline int TRAILB /* closes state if !=0 (i.e. adds trailbyte and padding) */ -) -> reg ptr u64[25] /* pst */ - , inline int /* AT */ - , reg u256[7] /* st */ - , reg u64 /* buf */ -{ - reg u64 i; - inline int ALL, ITERS; - - ALL = AT + LEN; - if ( (AT+LEN) < RATE8 ) { // not enough to fill a block! - pst, AT, buf = __pstate_imem_avx2(pst, AT, buf, LEN, TRAILB); - if (TRAILB != 0) { // add pstate and closes the state - i = AT/8 + 1; - if (AT <= 5*8) { // only st[0..1] is affected - while (i < 5) { pst[i] = 0; i += 1; } - st = __addpst01(st, pst); - st = __addratebit_avx2(st, RATE8); - } else { // all state is affected - while (i < RATE8/8) { pst[i] = 0; i += 1; } - pst[u8 RATE8-1] ^= 0x80; - st = _addpstate_avx2(st, pst); - } - } - } else { // at least a block is filled - if ( AT != 0 ) { // start by filling the first block - pst, _, buf = __pstate_imem_avx2(pst, AT, buf, RATE8-AT, 0); - LEN = LEN - (RATE8-AT); - st = _addpstate_avx2(st, pst); - st = _keccakf1600_avx2(st); - AT = 0; - } - - // continue by processing full blocks - ITERS = LEN / RATE8; // number of full blocks - i = 0; - while ( i < ITERS ) { - st, buf = __addstate_imem_avx2(st, buf, RATE8, 0); - st = _keccakf1600_avx2(st); - i += 1; - } - - // last incomplete block - LEN = ALL % RATE8; - if (TRAILB!=0) { - st, buf = __addstate_imem_avx2(st, buf, LEN, TRAILB); - st = __addratebit_avx2(st, RATE8); - AT = 0; - } else if ( LEN != 0) { - pst, AT, buf = __pstate_imem_avx2(pst, 0, buf, LEN, TRAILB); - } - } - return pst, AT, st, buf; -} - -/* - ONE-SHOT (FIXED-SIZE) MEMORY SQUEEZE - ==================================== -*/ - -inline fn __dumpstate_imem_avx2 -( reg u64 buf -, inline int LEN -, reg u256[7] st -) -> reg u64 -{ - reg u64 t; - reg u128 t128_0, t128_1; - reg u256 t256_0, t256_1, t256_2, t256_3, t256_4; - - // reg0 - if (8 <= LEN) { - buf, _ = __mwrite_subu256(buf, 8, st[0]); - LEN -= 8; - } else { - buf, LEN = __mwrite_subu256(buf, LEN, st[0]); - } - - // reg1 - buf, LEN = __mwrite_subu256(buf, LEN, st[1]); - - // reg2 (5) - if (0 reg u64 /* buf */ - , reg u256[7] /* st */ -{ - reg u64 i; - inline int ITERS, LO; - ITERS = LEN/RATE8; - LO = LEN%RATE8; - if (0 reg u256[7] -{ - inline int i; - reg u256[7] state; - - for i=0 to 7 - { state[i] = #set0_256(); } - - return state; -} - - -inline fn __init_s_state_avx2() -> stack u64[28] -{ - inline int i; - stack u64[28] s_state; - reg u256 zero; - - zero = #set0_256(); - for i=0 to 7 - { s_state[u256 i] = zero; } - - return s_state; -} - - -inline fn __add_full_block_avx2( - reg u256[7] state, - stack u64[28] s_state, - reg ptr u64[25] a_jagged_p, - reg u64 in inlen, - reg u64 rate -) -> reg u256[7], stack u64[28], reg u64, reg u64 -{ - - inline int i; - reg u64 j l t rate8; - - rate8 = rate; - rate8 >>= 3; - j = 0; - while ( j < rate8 ) - { - t = [in + 8*j]; - l = a_jagged_p[(int) j]; - s_state[(int) l] = t; - j += 1; - } - - //TODO: check & change to #VPBROADCAST_4u64 - t = s_state[0]; - s_state[1] = t; - s_state[2] = t; - s_state[3] = t; - - for i = 0 to 7 - { state[i] ^= s_state[u256 i]; } - - in += rate; - inlen -= rate; - - return state, s_state, in, inlen; -} - - -// TODO: refactor when this feature is available: https://github.com/haslab/libjbn/wiki/Feature-request-%231#procedural-parameters -inline fn __add_final_block_avx2( - reg u256[7] state, - stack u64[28] s_state, - reg ptr u64[25] a_jagged_p, - reg u64 in inlen, - reg u8 trail_byte, - reg u64 rate -) -> reg u256[7] -{ - inline int i; - reg u64 j l t inlen8; - reg u8 c; - - s_state = __init_s_state_avx2(); - - inlen8 = inlen; - inlen8 >>= 3; - j = 0; - while ( j < inlen8 ) - { - t = [in + 8*j]; - l = a_jagged_p[(int) j]; - s_state[(int) l] = t; - j += 1; - } - l = a_jagged_p[(int) j]; - l <<= 3; - j <<= 3; - - while ( j < inlen ) - { - c = (u8)[in + j]; - s_state[u8 (int) l] = c; - j += 1; - l += 1; - } - - s_state[u8 (int) l] = trail_byte; - - // j = (rate-1) >> 3; - j = rate; j -= 1; j >>= 3; - l = a_jagged_p[(int) j]; - l <<= 3; - // l += ((rate-1) & 0x7) - j = rate; j -= 1; j &= 0x7; - l += j; - - s_state[u8 (int) l] ^= 0x80; - - t = s_state[0]; - s_state[1] = t; - s_state[2] = t; - s_state[3] = t; - - for i = 0 to 7 - { state[i] ^= s_state[u256 i]; } - - return state; -} - - -// obs: @pre: len <= rate_in_bytes -inline fn __xtr_full_block_avx2( - reg u256[7] state, - reg ptr u64[25] a_jagged_p, - reg u64 out, - reg u64 len -) -> reg u64 -{ - inline int i; - stack u64[28] s_state; - reg u64 j l t len8; - - for i = 0 to 7 - { s_state[u256 i] = state[i]; } - - len8 = len; - len8 >>= 3; - j = 0; - while ( j < len8 ) - { - l = a_jagged_p[(int) j]; - t = s_state[(int) l]; - [out + 8*j] = t; - j += 1; - } - - out += len; - - return out; -} - - -// obs: @pre: len <= rate_in_bytes -inline fn __xtr_bytes_avx2( - reg u256[7] state, - reg ptr u64[25] a_jagged_p, - reg u64 out, - reg u64 len -) -> reg u64 -{ - inline int i; - stack u64[28] s_state; - reg u64 j l t len8; - reg u8 c; - - for i = 0 to 7 - { s_state[u256 i] = state[i]; } - - len8 = len; - len8 >>= 3; - j = 0; - while ( j < len8 ) - { l = a_jagged_p[(int) j]; - t = s_state[(int) l]; - [out + 8*j] = t; - j += 1; - } - l = a_jagged_p[(int)j]; - j <<= 3; - l <<= 3; - - while ( j < len ) - { - c = s_state[u8 (int) l]; - (u8)[out + j] = c; - j += 1; - l += 1; - } - - out += len; - - return out; -} - - -inline fn __absorb_avx2( - reg u256[7] state, - reg u64 in inlen, - reg u8 trail_byte, - reg u64 rate -) -> reg u256[7] -{ - stack u64[28] s_state; - reg ptr u64[25] a_jagged_p; - - a_jagged_p = KECCAK_A_JAGGED; - s_state = __init_s_state_avx2(); - - // intermediate blocks - while ( inlen >= rate ) - { - state, s_state, in, inlen = __add_full_block_avx2(state, s_state, a_jagged_p, in, inlen, rate); - state = _keccakf1600_avx2_(state); - } - - // final block - state = __add_final_block_avx2(state, s_state, a_jagged_p, in, inlen, trail_byte, rate); - - return state; -} - - -inline fn __squeeze_avx2(reg u256[7] state, reg u64 out outlen rate) -> reg u256[7] -{ - reg ptr u64[25] a_jagged_p; - - a_jagged_p = KECCAK_A_JAGGED; - - // intermediate blocks - while ( outlen > rate ) - { - state = _keccakf1600_avx2_(state); - out = __xtr_full_block_avx2(state, a_jagged_p, out, rate); - outlen -= rate; - } - - state = _keccakf1600_avx2_(state); - out = __xtr_bytes_avx2(state, a_jagged_p, out, outlen); - return state; -} - - -inline fn __keccak1600_avx2(reg u64 out outlen in inlen, reg u8 trail_byte, reg u64 rate) -{ - reg u256[7] state; - - state = __keccak_init_avx2(); - - // absorb - state = __absorb_avx2(state, in, inlen, trail_byte, rate); - - // squeeze - _ = __squeeze_avx2(state, out, outlen, rate); -} - - -fn _keccak1600_avx2(reg u64 out outlen in inlen, reg u8 trail_byte, reg u64 rate) -{ - __keccak1600_avx2(out, outlen, in, inlen, trail_byte, rate); -} - diff --git a/code/jasmin/mlkem_avx2/keccak/keccak1600_orig_avx2_ASIZE.jinc b/code/jasmin/mlkem_avx2/keccak/keccak1600_orig_avx2_ASIZE.jinc deleted file mode 100644 index 7d4c3e25..00000000 --- a/code/jasmin/mlkem_avx2/keccak/keccak1600_orig_avx2_ASIZE.jinc +++ /dev/null @@ -1,247 +0,0 @@ -/* DEPENDENCIES: -require "keccak1600_orig_avx2.jinc" -param int ASIZE=101; -*/ - -inline fn __addarray_full_block_avx2 -( reg u256[7] state -, stack u64[28] s_state -, reg ptr u64[25] a_jagged_p -, reg const ptr u8[ASIZE] in -, reg u64 offset -, reg u64 inlen -, reg u64 rate -) -> reg u256[7] /* st */ - , stack u64[28] /* pst */ - , reg u64 /* offset */ - , reg u64 /* inlen */ -{ - - inline int i; - reg u64 j l t rate8; - - rate8 = rate; - rate8 >>= 3; - j = 0; - while ( j < rate8 ) - { - t = in.[u64 offset + 8*j]; - l = a_jagged_p[(int) j]; - s_state[(int) l] = t; - j += 1; - } - - //TODO: check & change to #VPBROADCAST_4u64 - t = s_state[0]; - s_state[1] = t; - s_state[2] = t; - s_state[3] = t; - - for i = 0 to 7 - { state[i] ^= s_state[u256 i]; } - - offset += rate; - inlen -= rate; - - return state, s_state, offset, inlen; -} - - -// TODO: refactor when this feature is available: https://github.com/haslab/libjbn/wiki/Feature-request-%231#procedural-parameters -inline fn __addarray_final_block_avx2 -( reg u256[7] state -, stack u64[28] s_state -, reg ptr u64[25] a_jagged_p -, reg const ptr u8[ASIZE] in -, reg u64 offset -, reg u64 inlen -, reg u8 trail_byte -, reg u64 rate -) -> reg u256[7] -{ - inline int i; - reg u64 j l t inlen8; - reg u8 c; - - s_state = __init_s_state_avx2(); - - inlen8 = inlen; - inlen8 >>= 3; - j = 0; - while ( j < inlen8 ) - { - t = in.[u64 offset + 8*j]; - l = a_jagged_p[(int) j]; - s_state[(int) l] = t; - j += 1; - } - l = a_jagged_p[(int) j]; - l <<= 3; - j <<= 3; - - while ( j < inlen ) - { - c = in.[u8 offset + j]; - s_state[u8 (int) l] = c; - j += 1; - l += 1; - } - - s_state[u8 (int) l] = trail_byte; - - // j = (rate-1) >> 3; - j = rate; j -= 1; j >>= 3; - l = a_jagged_p[(int) j]; - l <<= 3; - // l += ((rate-1) & 0x7) - j = rate; j -= 1; j &= 0x7; - l += j; - - s_state[u8 (int) l] ^= 0x80; - - t = s_state[0]; - s_state[1] = t; - s_state[2] = t; - s_state[3] = t; - - for i = 0 to 7 - { state[i] ^= s_state[u256 i]; } - - return state; -} - - -// obs: @pre: len <= rate_in_bytes -inline fn __xtrarray_full_block_avx2 -( reg u256[7] state -, reg ptr u64[25] a_jagged_p -, reg mut ptr u8[ASIZE] out -, reg u64 offset -, reg u64 len -) -> reg ptr u8[ASIZE] /* out */ - , reg u64 /* offset */ -{ - inline int i; - stack u64[28] s_state; - reg u64 j l t len8; - - for i = 0 to 7 - { s_state[u256 i] = state[i]; } - - len8 = len; - len8 >>= 3; - j = 0; - while ( j < len8 ) - { - l = a_jagged_p[(int) j]; - t = s_state[(int) l]; - out.[u64 offset + 8*j] = t; - j += 1; - } - - offset += len; - - return out, offset; -} - - -// obs: @pre: len <= rate_in_bytes -inline fn __xtrarray_bytes_avx2 -( reg u256[7] state -, reg ptr u64[25] a_jagged_p -, reg mut ptr u8[ASIZE] out -, reg u64 offset -, reg u64 len -) -> reg ptr u8[ASIZE] /* out */ - , reg u64 /* offset */ -{ - inline int i; - stack u64[28] s_state; - reg u64 j l t len8; - reg u8 c; - - for i = 0 to 7 - { s_state[u256 i] = state[i]; } - - len8 = len; - len8 >>= 3; - j = 0; - while ( j < len8 ) - { l = a_jagged_p[(int) j]; - t = s_state[(int) l]; - out.[u64 offset + 8*j] = t; - j += 1; - } - l = a_jagged_p[(int)j]; - j <<= 3; - l <<= 3; - - while ( j < len ) - { - c = s_state[u8 (int) l]; - out.[u8 offset + j] = c; - j += 1; - l += 1; - } - - offset += len; - - return out, offset; -} - - -inline fn __absorbarray_avx2 -( reg const ptr u8[ASIZE] in -, reg u64 offset -, reg u64 inlen -, reg u8 trail_byte -, reg u64 rate -) -> reg u256[7] -{ reg u256[7] state; - stack u64[28] s_state; - reg ptr u64[25] a_jagged_p; - - a_jagged_p = KECCAK_A_JAGGED; - s_state = __init_s_state_avx2(); - - // intermediate blocks - while ( inlen >= rate ) - { - state, s_state, offset, inlen = __addarray_full_block_avx2(state, s_state, a_jagged_p, in, offset, inlen, rate); - state = _keccakf1600_avx2_(state); - } - - // final block - state = __addarray_final_block_avx2(state, s_state, a_jagged_p, in, offset, inlen, trail_byte, rate); - - return state; -} - - -inline fn __squeezearray_avx2 -( reg u256[7] state -, reg mut ptr u8[ASIZE] out -, reg u64 offset -, reg u64 outlen -, reg u64 rate -) -> reg ptr u8[ASIZE] /* out */ - , reg u64 /* offset */ - , reg u256[7] /* state */ -{ - reg ptr u64[25] a_jagged_p; - - a_jagged_p = KECCAK_A_JAGGED; - - // intermediate blocks - while ( outlen > rate ) - { - state = _keccakf1600_avx2_(state); - out, offset = __xtrarray_full_block_avx2(state, a_jagged_p, out, offset, rate); - outlen -= rate; - } - - state = _keccakf1600_avx2_(state); - out, offset = __xtrarray_bytes_avx2(state, a_jagged_p, out, offset, outlen); - return out, offset, state; -} - diff --git a/code/jasmin/mlkem_avx2/keccak/keccak1600x4_array_avx2_ASIZE.jinc b/code/jasmin/mlkem_avx2/keccak/keccak1600x4_array_avx2_ASIZE.jinc deleted file mode 100644 index dab79aef..00000000 --- a/code/jasmin/mlkem_avx2/keccak/keccak1600x4_array_avx2_ASIZE.jinc +++ /dev/null @@ -1,416 +0,0 @@ -/* DEPENDENCIES: -require "keccak1600x4_avx2.jinc" -param int ASIZE = 1001; -*/ - -require "subreadwrite_array_ASIZE.jinc" - - -/* - INCREMENTAL ARRAY BROADCAST ABSORB - ================================== -*/ - -inline fn __addstate_bcast_array_avx2x4 -( reg mut ptr u256[25] st -, inline int AT /* bytes (0 <= AT < 200) */ -, reg const ptr u8[ASIZE] buf -, reg u64 offset -, inline int LEN -, inline int TRAILB -) -> reg ptr u256[25] /* st */ - , inline int /* AT */ - , reg u64 /* offset */ -{ - inline int DELTA, LO, ALL; - reg u64 at; - reg u256 t256; - - ALL = AT+LEN; // total bytes to process (excluding trail byte, if !=0) - LO = AT % 8; // leftover bytes - at = 32 * (AT / 8); // current pstate position - DELTA = 0; - - if ( 0 < LO ) { // process first word... - if ( LO+LEN < 8) { // ...not enough to fill a word (just update it) - if ( TRAILB != 0 ) { ALL += 1; } - DELTA, _, TRAILB, t256 = __aread_bcast_4subu64(buf, offset, DELTA, LEN, TRAILB); - t256 = #VPSLL_4u64(t256, 8*LO); - t256 ^= st.[u256 (int) at]; - st.[u256 (int) at] = t256; - LO = 0; - AT = 0; - LEN = 0; - } else { // process first word - if ( 8 <= LEN ) { - t256 = #VPBROADCAST_4u64(buf.[u64 offset + DELTA]); - DELTA += (8-LO); - } else { - DELTA, _, _, t256 = __aread_bcast_4subu64(buf, offset, DELTA, 8-LO, 0); - } - LEN -= 8-LO; - AT += 8-LO; - t256 = #VPSLL_4u64(t256, 8*LO); - t256 ^= st.[u256 (int) at]; - st.[u256 (int) at] = t256; - at += 32; - } - } - - offset += DELTA; - DELTA = 0; - // continue processing remaining bytes - if (8 <= LEN) { - while ( at < 32*(AT/8)+32*(LEN/8)) { - t256 = #VPBROADCAST_4u64(buf.[u64 offset]); - offset += 8; - t256 ^= st.[u256 at]; - st.[u256 at] = t256; - at += 32; - } - LEN = (AT+LEN) % 8; - } - - // process last word (possibly closing the state) - LO = (AT+LEN) % 8; - if ( 0 < LO || TRAILB != 0 ) { - if ( TRAILB != 0 ) { ALL += 1; } - DELTA, _, TRAILB, t256 = __aread_bcast_4subu64(buf, offset, DELTA, LO, TRAILB); - offset += DELTA; - t256 ^= st.[u256 at]; - st.[u256 at] = t256; - } - return st, ALL, offset; -} - -inline fn __absorb_bcast_array_avx2x4 -( reg mut ptr u256[25] st -, inline int AT -, reg const ptr u8[ASIZE] buf -, reg u64 offset -, inline int LEN -, inline int RATE8 -, inline int TRAILB /* closes state if !=0 (i.e. adds trailbyte and padding) */ -) -> reg ptr u256[25] /* st */ - , inline int /* AT */ - , reg u64 /* offset */ -{ - reg u64 i; - inline int ALL, ITERS; - - ALL = AT + LEN; - if ( (AT+LEN) < RATE8 ) { // not enough to fill a block! - st, AT, offset = __addstate_bcast_array_avx2x4(st, AT, buf, offset, LEN, TRAILB); - if (TRAILB != 0) { // add pstate and closes the state - st = __addratebit_avx2x4(st, RATE8); - } - } else { // at least a block is filled - if ( AT != 0 ) { // start by filling the first block - st, _, offset = __addstate_bcast_array_avx2x4(st, AT, buf, offset, RATE8-AT, 0); - LEN = LEN - (RATE8-AT); - st = _keccakf1600_avx2x4(st); - AT = 0; - } - - // continue by processing full blocks - ITERS = LEN / RATE8; // number of full blocks - i = 0; - while ( i < ITERS ) { - st, _, offset = __addstate_bcast_array_avx2x4(st, 0, buf, offset, RATE8, 0); - st = _keccakf1600_avx2x4(st); - i += 1; - } - - // last incomplete block - LEN = ALL % RATE8; - st, AT, offset = __addstate_bcast_array_avx2x4(st, 0, buf, offset, LEN, TRAILB); - if (TRAILB!=0) { st = __addratebit_avx2x4(st, RATE8); } - } - return st, AT, offset; -} - -/* - INCREMENTAL (FIXED-SIZE) MEMORY 4-way ABSORB - ============================================ -*/ - -inline fn __addstate_array_avx2x4 -( reg mut ptr u256[25] st -, inline int AT /* bytes (0 <= AT < 200) */ -, reg const ptr u8[ASIZE] buf0 buf1 buf2 buf3 -, reg u64 offset -, inline int LEN -, inline int TRAILB -) -> reg ptr u256[25] /* st */ - , inline int /* AT */ - , reg u64 /* offset */ -{ - inline int DELTA, LO, ALL; - reg u64 at, t0, t1, t2, t3; - reg u256 t256_0, t256_1, t256_2, t256_3; - - ALL = AT+LEN; // total bytes to process (excluding trail byte, if !=0) - LO = AT % 8; // leftover bytes - at = 4 * (AT / 8); // current pstate position (referencing u64 words) -//at = 0, 4, 8, ... - DELTA = 0; - - if ( 0 < LO ) { // process first word... - if ( LO+LEN < 8) { // ...not enough to fill a word (just update it) - if ( TRAILB != 0 ) { ALL += 1; } - _, _, _, t0 = __aread_subu64(buf0, offset, DELTA, LEN, TRAILB); - _, _, _, t1 = __aread_subu64(buf1, offset, DELTA, LEN, TRAILB); - _, _, _, t2 = __aread_subu64(buf2, offset, DELTA, LEN, TRAILB); - DELTA, _, _, t3 = __aread_subu64(buf3, offset, DELTA, LEN, TRAILB); - t0 <<= 8*LO; - t0 ^= st[u64 at + 0]; - st[u64 at + 0] = t0; - t1 <<= 8*LO; - t1 ^= st[u64 at + 1]; - st[u64 at + 1] = t1; - t2 <<= 8*LO; - t2 ^= st[u64 at + 2]; - st[u64 at + 2] = t2; - t3 <<= 8*LO; - t3 ^= st[u64 at + 3]; - st[u64 at + 3] = t3; - LO = 0; - AT = 0; - LEN = 0; - TRAILB = 0; - } else { // process first word - if ( 8 <= LEN ) { - t0 = buf0.[u64 offset + DELTA]; - t1 = buf1.[u64 offset + DELTA]; - t2 = buf2.[u64 offset + DELTA]; - t3 = buf3.[u64 offset + DELTA]; - offset += 8-LO; - } else { - _, _, _, t0 = __aread_subu64(buf0, offset, DELTA, 8-LO, TRAILB); - _, _, _, t1 = __aread_subu64(buf1, offset, DELTA, 8-LO, TRAILB); - _, _, _, t2 = __aread_subu64(buf2, offset, DELTA, 8-LO, TRAILB); - DELTA, _, _, t3 = __aread_subu64(buf3, offset, DELTA, 8-LO, TRAILB); - } - LEN -= 8-LO; - AT += 8-LO; - t0 <<= 8*LO; - t0 ^= st[u64 at + 0]; - st[u64 at + 0] = t0; - t1 <<= 8*LO; - t1 ^= st[u64 at + 1]; - st[u64 at + 1] = t1; - t2 <<= 8*LO; - t2 ^= st[u64 at + 2]; - st[u64 at + 2] = t2; - t3 <<= 8*LO; - t3 ^= st[u64 at + 3]; - st[u64 at + 3] = t3; - at += 4; - } - } - offset += DELTA; - DELTA = 0; - // continue processing remaining bytes - if (8 <= LEN) { - while ( at < 4*(AT/8)+32*(LEN/32) ) { - t256_0 = buf0.[u256 offset]; - t256_1 = buf1.[u256 offset]; - t256_2 = buf0.[u256 offset]; - t256_3 = buf0.[u256 offset]; - offset += 32; - t256_0, t256_1, t256_2, t256_3 = __4u64x4_u256x4(t256_0, t256_1, t256_2, t256_3); - st.[u256 8*at] = t256_0; - st.[u256 8*at+32] = t256_1; - st.[u256 8*at+64] = t256_2; - st.[u256 8*at+96] = t256_3; - at += 32; - } - while ( at < 4*(AT/8)+4*(LEN/8)) { - t0 = buf0.[u64 offset]; - t0 ^= st[u64 at + 0]; - st[u64 at + 0] = t0; - t1 = buf1.[u64 offset]; - t1 ^= st[u64 at + 1]; - st[u64 at + 1] = t1; - t2 = buf2.[u64 offset]; - t2 ^= st[u64 at + 2]; - st[u64 at + 2] = t2; - t3 = buf3.[u64 offset]; - offset += 8; - t3 ^= st[u64 at + 3]; - st[u64 at + 3] = t3; - at += 4; - } - LEN = (AT+LEN) % 8; - } - - // process last word (possibly closing the state) - LO = (AT+LEN) % 8; - if ( 0 < LO || TRAILB != 0 ) { - _, _, _, t0 = __aread_subu64(buf0, offset, DELTA, LO, TRAILB); - _, _, _, t1 = __aread_subu64(buf1, offset, DELTA, LO, TRAILB); - _, _, _, t2 = __aread_subu64(buf2, offset, DELTA, LO, TRAILB); - DELTA, _, _, t3 = __aread_subu64(buf3, offset, DELTA, LO, TRAILB); - offset += DELTA; - if ( TRAILB != 0 ) { ALL += 1; TRAILB = 0; } - t0 ^= st[u64 at + 0]; - st[u64 at + 0] = t0; - t1 ^= st[u64 at + 1]; - st[u64 at + 1] = t1; - t2 ^= st[u64 at + 2]; - st[u64 at + 2] = t2; - t3 ^= st[u64 at + 3]; - st[u64 at + 3] = t3; - } - - return st, ALL, offset; -} - - -inline fn __absorb_array_avx2x4 -( reg mut ptr u256[25] st -, inline int AT -, reg const ptr u8[ASIZE] buf0 buf1 buf2 buf3 -, reg u64 offset -, inline int LEN -, inline int RATE8 -, inline int TRAILB /* closes state if !=0 (i.e. adds trailbyte and padding) */ -) -> reg ptr u256[25] /* st */ - , inline int /* AT */ - , reg u64 /* offset */ -{ - reg u64 i; - inline int ALL, ITERS; - - ALL = AT + LEN; - if ( (AT+LEN) < RATE8 ) { // not enough to fill a block! - st, AT, offset - = __addstate_array_avx2x4(st, AT, buf0, buf1, buf2, buf3, offset, LEN, TRAILB); - if (TRAILB != 0) { // add pstate and closes the state - st = __addratebit_avx2x4(st, RATE8); - } - } else { // at least a block is filled - if ( AT != 0 ) { // start by filling the first block - st, _, offset - = __addstate_array_avx2x4(st, AT, buf0, buf1, buf2, buf3, offset, RATE8-AT, 0); - LEN = LEN - (RATE8-AT); - st = _keccakf1600_avx2x4(st); - AT = 0; - } - - // continue by processing full blocks - ITERS = LEN / RATE8; // number of full blocks - i = 0; - while ( i < ITERS ) { - st, _, offset - = __addstate_array_avx2x4(st, 0, buf0, buf1, buf2, buf3, offset, RATE8, 0); - st = _keccakf1600_avx2x4(st); - i += 1; - } - - // last incomplete block - LEN = ALL % RATE8; - st, AT, offset - = __addstate_array_avx2x4(st, 0, buf0, buf1, buf2, buf3, offset, LEN, TRAILB); - if (TRAILB!=0) { st = __addratebit_avx2x4(st, RATE8); } - } - return st, AT, offset; -} - - -/* - ONE-SHOT (FIXED-SIZE) MEMORY SQUEEZE - ==================================== -*/ -inline fn __dumpstate_array_avx2x4 -( reg mut ptr u8[ASIZE] buf0 buf1 buf2 buf3 -, reg u64 offset -, inline int LEN -, reg const ptr u256[25] st -) -> reg ptr u8[ASIZE] /* buf0 */ - , reg ptr u8[ASIZE] /* buf1 */ - , reg ptr u8[ASIZE] /* buf2 */ - , reg ptr u8[ASIZE] /* buf3 */ - , reg u64 /* offset */ -{ - reg u256 x0, x1, x2, x3; - reg u64 i, t0, t1, t2, t3; - i = 0; - while (i reg ptr u8[ASIZE] /* buf0 */ - , reg ptr u8[ASIZE] /* buf1 */ - , reg ptr u8[ASIZE] /* buf2 */ - , reg ptr u8[ASIZE] /* buf3 */ - , reg u64 /* offset */ - , reg ptr u256[25] /* st */ -{ - reg u64 i; - inline int ITERS, LO; - ITERS = LEN/RATE8; - LO = LEN%RATE8; - - if (0 reg ptr u256[25] -{ - reg u64 i; - reg u256 z256; - z256 = #set0_256(); - i = 0; - while (i < 32*25) { - st.[u256 (int) i] = z256; - i += 32; - } - return st; -} - -/* - ADD RATE BIT - ============ -*/ - -inline fn __addratebit_avx2x4 -( reg mut ptr u256[25] st -, inline int RATE8 -) -> reg ptr u256[25] -{ - reg u256 t256; - reg u128 t128; - reg u64 t64; - t64 = 1; - t64 <<= (8*RATE8-1) % 64; // obs: should be 63 for all admissible rates! - t128 = (128u) t64; - t256 = #VPBROADCAST_4u64(t128); - t256 ^= st[(RATE8-1)/8]; - st[(RATE8-1)/8] = t256; - return st; -} - -/* - State25 to/from State4x25 - ========================= -*/ -// pack 4 keccak states (st25) into a 4-way state (st4x) -inline fn __u256x4_4u64x4 -( reg u256 x0 x1 x2 x3 -) -> reg u256, reg u256, reg u256, reg u256 { - // x0 = l00 l01 l02 l03 - // x1 = l10 l11 l12 l13 - // x2 = l20 l21 l22 l23 - // x3 = l30 l31 l32 l33 - reg u256 y0, y1, y2, y3; - y0 = #VPUNPCKL_4u64(x0, x1); // y0 = l00 l10 l02 l12 - y1 = #VPUNPCKH_4u64(x0, x1); // y1 = l01 l11 l03 l13 - y2 = #VPUNPCKL_4u64(x2, x3); // y2 = l20 l30 l22 l32 - y3 = #VPUNPCKH_4u64(x2, x3); // y3 = l21 l31 l23 l33 - - x0 = #VPERM2I128(y0, y2, 0x20); // x0 = l00 l10 l20 l30 - x1 = #VPERM2I128(y1, y3, 0x20); // x1 = l01 l11 l21 l31 - x2 = #VPERM2I128(y0, y2, 0x31); // x2 = l02 l12 l22 l32 - x3 = #VPERM2I128(y1, y3, 0x31); // x3 = l03 l13 l23 l33 - - return x0, x1, x2, x3; -} - -// extracts 4 keccak states (st25) from a 4-way state (st4x) -inline fn __4u64x4_u256x4 -( reg u256 y0 y1 y2 y3 -) -> reg u256, reg u256, reg u256, reg u256 { - // y0 = l00 l10 l20 l30 - // y1 = l01 l11 l21 l31 - // y2 = l02 l12 l22 l32 - // y3 = l03 l13 l23 l33 - reg u256 x0, x1, x2, x3; - x0 = #VPERM2I128(y0, y2, 0x20); // x0 = l00 l10 l02 l12 - x1 = #VPERM2I128(y1, y3, 0x20); // x1 = l01 l11 l03 l13 - x2 = #VPERM2I128(y0, y2, 0x31); // x2 = l20 l30 l22 l32 - x3 = #VPERM2I128(y1, y3, 0x31); // x3 = l21 l31 l23 l33 - - y0 = #VPUNPCKL_4u64(x0, x1); // y0 = l00 l01 l02 l03 - y1 = #VPUNPCKH_4u64(x0, x1); // y1 = l10 l11 l12 l13 - y2 = #VPUNPCKL_4u64(x2, x3); // y2 = l20 l21 l22 l23 - y3 = #VPUNPCKH_4u64(x2, x3); // y3 = l30 l31 l32 l33 - - return y0, y1, y2, y3; -} diff --git a/code/jasmin/mlkem_avx2/keccak/keccak1600x4_imem_avx2.jinc b/code/jasmin/mlkem_avx2/keccak/keccak1600x4_imem_avx2.jinc deleted file mode 100644 index 1733146c..00000000 --- a/code/jasmin/mlkem_avx2/keccak/keccak1600x4_imem_avx2.jinc +++ /dev/null @@ -1,421 +0,0 @@ -require "keccak1600x4_avx2.jinc" - - - - -/* - INCREMENTAL (FIXED-SIZE) MEMORY BROADCAST ABSORB - ================================================ -*/ - -inline fn __addstate_bcast_imem_avx2x4 -( reg mut ptr u256[25] st -, inline int AT /* bytes (0 <= AT < 200) */ -, reg u64 buf -, inline int LEN -, inline int TRAILB -) -> reg ptr u256[25] /* st */ - , inline int /* AT */ - , reg u64 /* buf */ -{ - inline int LO, ALL; - reg u64 at; - reg u256 t256; - - ALL = AT+LEN; // total bytes to process (excluding trail byte, if !=0) - LO = AT % 8; // leftover bytes - at = 32 * (AT / 8); // current pstate position - - if ( 0 < LO ) { // process first word... - if ( LO+LEN < 8) { // ...not enough to fill a word (just update it) - if ( TRAILB != 0 ) { ALL += 1; } - buf, _, TRAILB, t256 = __mread_bcast_4subu64(buf, LEN, TRAILB); - t256 = #VPSLL_4u64(t256, 8*LO); - t256 ^= st.[u256 (int) at]; - st.[u256 (int) at] = t256; - LO = 0; - AT = 0; - LEN = 0; - } else { // process first word - if ( 8 <= LEN ) { - t256 = #VPBROADCAST_4u64((u64)[buf]); - buf += (8-LO); - } else { - buf, _, _, t256 = __mread_bcast_4subu64(buf, 8-LO, 0); - } - LEN -= 8-LO; - AT += 8-LO; - t256 = #VPSLL_4u64(t256, 8*LO); - t256 ^= st.[u256 (int) at]; - st.[u256 (int) at] = t256; - at += 32; - } - } - - // continue processing remaining bytes - if (8 <= LEN) { - while ( at < 32*(AT/8)+32*(LEN/8)) { - t256 = #VPBROADCAST_4u64((u64)[buf]); - buf += 8; - t256 ^= st.[u256 at]; - st.[u256 at] = t256; - at += 32; - } - LEN = (AT+LEN) % 8; - } - - // process last word (possibly closing the state) - LO = (AT+LEN) % 8; - if ( 0 < LO || TRAILB != 0 ) { - if ( TRAILB != 0 ) { ALL += 1; } - buf, _, TRAILB, t256 = __mread_bcast_4subu64(buf, LO, TRAILB); - t256 ^= st.[u256 at]; - st.[u256 at] = t256; - } - - return st, ALL, buf; -} - -inline fn __absorb_bcast_imem_avx2x4 -( reg mut ptr u256[25] st -, inline int AT -, reg u64 buf -, inline int LEN -, inline int RATE8 -, inline int TRAILB /* closes state if !=0 (i.e. adds trailbyte and padding) */ -) -> reg ptr u256[25] /* st */ - , inline int /* AT */ - , reg u64 /* buf */ -{ - reg u64 i; - inline int ALL, ITERS; - - ALL = AT + LEN; - if ( (AT+LEN) < RATE8 ) { // not enough to fill a block! - st, AT, buf = __addstate_bcast_imem_avx2x4(st, AT, buf, LEN, TRAILB); - if (TRAILB != 0) { // add pstate and closes the state - st = __addratebit_avx2x4(st, RATE8); - } - } else { // at least a block is filled - if ( AT != 0 ) { // start by filling the first block - st, _, buf = __addstate_bcast_imem_avx2x4(st, AT, buf, RATE8-AT, 0); - LEN = LEN - (RATE8-AT); - st = _keccakf1600_avx2x4(st); - AT = 0; - } - - // continue by processing full blocks - ITERS = LEN / RATE8; // number of full blocks - i = 0; - while ( i < ITERS ) { - st, _, buf = __addstate_bcast_imem_avx2x4(st, 0, buf, RATE8, 0); - st = _keccakf1600_avx2x4(st); - i += 1; - } - - // last incomplete block - LEN = ALL % RATE8; - st, AT, buf = __addstate_bcast_imem_avx2x4(st, 0, buf, LEN, TRAILB); - if (TRAILB!=0) { st = __addratebit_avx2x4(st, RATE8); } - } - return st, AT, buf; -} - -/* - INCREMENTAL (FIXED-SIZE) MEMORY 4-way ABSORB - ============================================ -*/ - -inline fn __addstate_imem_avx2x4 -( reg mut ptr u256[25] st -, inline int AT /* bytes (0 <= AT < 200) */ -, reg u64 buf0 buf1 buf2 buf3 -, inline int LEN -, inline int TRAILB -) -> reg ptr u256[25] /* st */ - , inline int /* AT */ - , reg u64 /* buf0 */ - , reg u64 /* buf1 */ - , reg u64 /* buf2 */ - , reg u64 /* buf3 */ -{ - inline int LO, ALL; - reg u64 at, t0, t1, t2, t3; - reg u256 t256_0, t256_1, t256_2, t256_3; - - ALL = AT+LEN; // total bytes to process (excluding trail byte, if !=0) - LO = AT % 8; // leftover bytes - at = 4 * (AT / 8); // current pstate position (referencing u64 words) -//at = 0, 4, 8, ... - - if ( 0 < LO ) { // process first word... - if ( LO+LEN < 8) { // ...not enough to fill a word (just update it) - if ( TRAILB != 0 ) { ALL += 1; } - buf0, _, _, t0 = __mread_subu64(buf0, LEN, TRAILB); - buf1, _, _, t1 = __mread_subu64(buf1, LEN, TRAILB); - buf2, _, _, t2 = __mread_subu64(buf2, LEN, TRAILB); - buf3, _, _, t3 = __mread_subu64(buf3, LEN, TRAILB); - t0 <<= 8*LO; - t0 ^= st[u64 at + 0]; - st[u64 at + 0] = t0; - t1 <<= 8*LO; - t1 ^= st[u64 at + 1]; - st[u64 at + 1] = t1; - t2 <<= 8*LO; - t2 ^= st[u64 at + 2]; - st[u64 at + 2] = t2; - t3 <<= 8*LO; - t3 ^= st[u64 at + 3]; - st[u64 at + 3] = t3; - LO = 0; - AT = 0; - LEN = 0; - TRAILB = 0; - } else { // process first word - if ( 8 <= LEN ) { - t0 = (u64)[buf0]; - buf0 += 8-LO; - t1 = (u64)[buf1]; - buf1 += 8-LO; - t2 = (u64)[buf2]; - buf2 += 8-LO; - t3 = (u64)[buf3]; - buf3 += 8-LO; - } else { - buf0, _, _, t0 = __mread_subu64(buf0, 8-LO, TRAILB); - buf1, _, _, t1 = __mread_subu64(buf1, 8-LO, TRAILB); - buf2, _, _, t2 = __mread_subu64(buf2, 8-LO, TRAILB); - buf3, _, _, t3 = __mread_subu64(buf3, 8-LO, TRAILB); - } - LEN -= 8-LO; - AT += 8-LO; - t0 <<= 8*LO; - t0 ^= st[u64 at + 0]; - st[u64 at + 0] = t0; - t1 <<= 8*LO; - t1 ^= st[u64 at + 1]; - st[u64 at + 1] = t1; - t2 <<= 8*LO; - t2 ^= st[u64 at + 2]; - st[u64 at + 2] = t2; - t3 <<= 8*LO; - t3 ^= st[u64 at + 3]; - st[u64 at + 3] = t3; - at += 4; - } - } - - // continue processing remaining bytes - if (8 <= LEN) { - while ( at < 4*(AT/8)+32*(LEN/32) ) { - t256_0 = (u256)[buf0]; - buf0 += 32; - t256_1 = (u256)[buf1]; - buf1 += 32; - t256_2 = (u256)[buf2]; - buf2 += 32; - t256_3 = (u256)[buf3]; - buf3 += 32; - t256_0, t256_1, t256_2, t256_3 = __4u64x4_u256x4(t256_0, t256_1, t256_2, t256_3); - st.[u256 8*at] = t256_0; - st.[u256 8*at+32] = t256_1; - st.[u256 8*at+64] = t256_2; - st.[u256 8*at+96] = t256_3; - at += 32; - } - while ( at < 4*(AT/8)+4*(LEN/8)) { - t0 = (u64)[buf0]; - buf0 += 8; - t0 ^= st[u64 at + 0]; - st[u64 at + 0] = t0; - t1 = (u64)[buf1]; - buf1 += 8; - t1 ^= st[u64 at + 1]; - st[u64 at + 1] = t1; - t2 = (u64)[buf2]; - buf2 += 8; - t2 ^= st[u64 at + 2]; - st[u64 at + 2] = t2; - t3 = (u64)[buf3]; - buf3 += 8; - t3 ^= st[u64 at + 3]; - st[u64 at + 3] = t3; - at += 4; - } - LEN = (AT+LEN) % 8; - } - - // process last word (possibly closing the state) - LO = (AT+LEN) % 8; - if ( 0 < LO || TRAILB != 0 ) { - buf0, _, _, t0 = __mread_subu64(buf0, LO, TRAILB); - buf1, _, _, t1 = __mread_subu64(buf1, LO, TRAILB); - buf2, _, _, t2 = __mread_subu64(buf2, LO, TRAILB); - buf3, _, _, t3 = __mread_subu64(buf3, LO, TRAILB); - if ( TRAILB != 0 ) { ALL += 1; TRAILB = 0; } - t0 ^= st[u64 at + 0]; - st[u64 at + 0] = t0; - t0 ^= st[u64 at + 1]; - st[u64 at + 1] = t1; - t0 ^= st[u64 at + 2]; - st[u64 at + 2] = t2; - t0 ^= st[u64 at + 3]; - st[u64 at + 3] = t3; - } - - return st, ALL, buf0, buf1, buf2, buf3; -} - - -inline fn __absorb_imem_avx2x4 -( reg mut ptr u256[25] st -, inline int AT -, reg u64 buf0 -, reg u64 buf1 -, reg u64 buf2 -, reg u64 buf3 -, inline int LEN -, inline int RATE8 -, inline int TRAILB /* closes state if !=0 (i.e. adds trailbyte and padding) */ -) -> reg ptr u256[25] /* st */ - , inline int /* AT */ - , reg u64 /* buf0 */ - , reg u64 /* buf1 */ - , reg u64 /* buf2 */ - , reg u64 /* buf3 */ -{ - reg u64 i; - inline int ALL, ITERS; - - ALL = AT + LEN; - if ( (AT+LEN) < RATE8 ) { // not enough to fill a block! - st, AT, buf0, buf1, buf2, buf3 - = __addstate_imem_avx2x4(st, AT, buf0, buf1, buf2, buf3, LEN, TRAILB); - if (TRAILB != 0) { // add pstate and closes the state - st = __addratebit_avx2x4(st, RATE8); - } - } else { // at least a block is filled - if ( AT != 0 ) { // start by filling the first block - st, _, buf0, buf1, buf2, buf3 - = __addstate_imem_avx2x4(st, AT, buf0, buf1, buf2, buf3, RATE8-AT, 0); - LEN = LEN - (RATE8-AT); - st = _keccakf1600_avx2x4(st); - AT = 0; - } - - // continue by processing full blocks - ITERS = LEN / RATE8; // number of full blocks - i = 0; - while ( i < ITERS ) { - st, _, buf0, buf1, buf2, buf3 - = __addstate_imem_avx2x4(st, 0, buf0, buf1, buf2, buf3, RATE8, 0); - st = _keccakf1600_avx2x4(st); - i += 1; - } - - // last incomplete block - LEN = ALL % RATE8; - st, AT, buf0, buf1, buf2, buf3 - = __addstate_imem_avx2x4(st, 0, buf0, buf1, buf2, buf3, LEN, TRAILB); - if (TRAILB!=0) { st = __addratebit_avx2x4(st, RATE8); } - } - return st, AT, buf0, buf1, buf2, buf3; -} - - -/* - ONE-SHOT (FIXED-SIZE) MEMORY SQUEEZE - ==================================== -*/ -inline fn __dumpstate_imem_avx2x4 -( reg u64 buf0 buf1 buf2 buf3 -, inline int LEN -, reg const ptr u256[25] st -) -> reg u64 /* buf0 */ - , reg u64 /* buf1 */ - , reg u64 /* buf2 */ - , reg u64 /* buf3 */ -{ - reg u256 x0, x1, x2, x3; - reg u64 i, t0, t1, t2, t3; - i = 0; - while (i reg u64 /* buf0 */ - , reg u64 /* buf1 */ - , reg u64 /* buf2 */ - , reg u64 /* buf3 */ - , reg ptr u256[25] /* st */ -{ - reg u64 i; - inline int ITERS, LO; - ITERS = LEN/RATE8; - LO = LEN%RATE8; - if (0 reg u256[7] -{ - reg u256[9] t; - reg u256 c00 c14 d00 d14; - - reg bool zf; - reg u64 r iotas_o; - - reg ptr u256[24] iotas_p; - - iotas_p = KECCAK_IOTAS; - iotas_o = 0; - - r = KECCAK_ROUNDS; - while - { - //######################################## Theta - c00 = #VPSHUFD_256(state[2], (4u2)[1,0,3,2]); - c14 = state[5] ^ state[3]; - t[2] = state[4] ^ state[6]; - c14 = c14 ^ state[1]; - c14 = c14 ^ t[2]; - t[4] = #VPERMQ(c14, (4u2)[2,1,0,3]); - c00 = c00 ^ state[2]; - t[0] = #VPERMQ(c00, (4u2)[1,0,3,2]); - t[1] = c14 >>4u64 63; - t[2] = c14 +4u64 c14; - t[1] = t[1] | t[2]; - d14 = #VPERMQ(t[1], (4u2)[0,3,2,1]); - d00 = t[1] ^ t[4]; - d00 = #VPERMQ(d00, (4u2)[0,0,0,0]); - c00 = c00 ^ state[0]; - c00 = c00 ^ t[0]; - t[0] = c00 >>4u64 63; - t[1] = c00 +4u64 c00; - t[1] = t[1] | t[0]; - state[2] = state[2] ^ d00; - state[0] = state[0] ^ d00; - d14 = #VPBLEND_8u32(d14, t[1], (8u1)[1,1,0,0,0,0,0,0]); - t[4] = #VPBLEND_8u32(t[4], c00, (8u1)[0,0,0,0,0,0,1,1]); - d14 = d14 ^ t[4]; - - //######################################## Rho + Pi + pre-Chi shuffle - t[3] = #VPSLLV_4u64(state[2], KECCAK_RHOTATES_LEFT[0] ); - state[2] = #VPSRLV_4u64(state[2], KECCAK_RHOTATES_RIGHT[0] ); - state[2] = state[2] | t[3]; - state[3] = state[3] ^ d14; - t[4] = #VPSLLV_4u64(state[3], KECCAK_RHOTATES_LEFT[2] ); - state[3] = #VPSRLV_4u64(state[3], KECCAK_RHOTATES_RIGHT[2] ); - state[3] = state[3] | t[4]; - state[4] = state[4] ^ d14; - t[5] = #VPSLLV_4u64(state[4], KECCAK_RHOTATES_LEFT[3] ); - state[4] = #VPSRLV_4u64(state[4], KECCAK_RHOTATES_RIGHT[3] ); - state[4] = state[4] | t[5]; - state[5] = state[5] ^ d14; - t[6] = #VPSLLV_4u64(state[5], KECCAK_RHOTATES_LEFT[4] ); - state[5] = #VPSRLV_4u64(state[5], KECCAK_RHOTATES_RIGHT[4] ); - state[5] = state[5] | t[6]; - state[6] = state[6] ^ d14; - t[3] = #VPERMQ(state[2], (4u2)[2,0,3,1]); - t[4] = #VPERMQ(state[3], (4u2)[2,0,3,1]); - t[7] = #VPSLLV_4u64(state[6], KECCAK_RHOTATES_LEFT[5] ); - t[1] = #VPSRLV_4u64(state[6], KECCAK_RHOTATES_RIGHT[5] ); - t[1] = t[1] | t[7]; - state[1] = state[1] ^ d14; - t[5] = #VPERMQ(state[4], (4u2)[0,1,2,3]); - t[6] = #VPERMQ(state[5], (4u2)[1,3,0,2]); - t[8] = #VPSLLV_4u64(state[1], KECCAK_RHOTATES_LEFT[1] ); - t[2] = #VPSRLV_4u64(state[1], KECCAK_RHOTATES_RIGHT[1] ); - t[2] = t[2] | t[8]; - - //######################################## Chi - t[7] = #VPSRLDQ_256(t[1], 8); - t[0] = !t[1] & t[7]; - state[3] = #VPBLEND_8u32(t[2], t[6], (8u1)[0,0,0,0,1,1,0,0]); - t[8] = #VPBLEND_8u32(t[4], t[2], (8u1)[0,0,0,0,1,1,0,0]); - state[5] = #VPBLEND_8u32(t[3], t[4], (8u1)[0,0,0,0,1,1,0,0]); - t[7] = #VPBLEND_8u32(t[2], t[3], (8u1)[0,0,0,0,1,1,0,0]); - state[3] = #VPBLEND_8u32(state[3], t[4], (8u1)[0,0,1,1,0,0,0,0]); - t[8] = #VPBLEND_8u32(t[8], t[5], (8u1)[0,0,1,1,0,0,0,0]); - state[5] = #VPBLEND_8u32(state[5], t[2], (8u1)[0,0,1,1,0,0,0,0]); - t[7] = #VPBLEND_8u32(t[7], t[6], (8u1)[0,0,1,1,0,0,0,0]); - state[3] = #VPBLEND_8u32(state[3], t[5], (8u1)[1,1,0,0,0,0,0,0]); - t[8] = #VPBLEND_8u32(t[8], t[6], (8u1)[1,1,0,0,0,0,0,0]); - state[5] = #VPBLEND_8u32(state[5], t[6], (8u1)[1,1,0,0,0,0,0,0]); - t[7] = #VPBLEND_8u32(t[7], t[4], (8u1)[1,1,0,0,0,0,0,0]); - state[3] = !state[3] & t[8]; - state[5] = !state[5] & t[7]; - state[6] = #VPBLEND_8u32(t[5], t[2], (8u1)[0,0,0,0,1,1,0,0]); - t[8] = #VPBLEND_8u32(t[3], t[5], (8u1)[0,0,0,0,1,1,0,0]); - state[3] = state[3] ^ t[3]; - state[6] = #VPBLEND_8u32(state[6], t[3], (8u1)[0,0,1,1,0,0,0,0]); - t[8] = #VPBLEND_8u32(t[8], t[4], (8u1)[0,0,1,1,0,0,0,0]); - state[5] = state[5] ^ t[5]; - state[6] = #VPBLEND_8u32(state[6], t[4], (8u1)[1,1,0,0,0,0,0,0]); - t[8] = #VPBLEND_8u32(t[8], t[2], (8u1)[1,1,0,0,0,0,0,0]); - state[6] = !state[6] & t[8]; - state[6] = state[6] ^ t[6]; - state[4] = #VPERMQ(t[1], (4u2)[0,1,3,2]); - t[8] = #VPBLEND_8u32(state[4], state[0], (8u1)[0,0,1,1,0,0,0,0]); - state[1] = #VPERMQ(t[1], (4u2)[0,3,2,1]); - state[1] = #VPBLEND_8u32(state[1], state[0], (8u1)[1,1,0,0,0,0,0,0]); - state[1] = !state[1] & t[8]; - state[2] = #VPBLEND_8u32(t[4], t[5], (8u1)[0,0,0,0,1,1,0,0]); - t[7] = #VPBLEND_8u32(t[6], t[4], (8u1)[0,0,0,0,1,1,0,0]); - state[2] = #VPBLEND_8u32(state[2], t[6], (8u1)[0,0,1,1,0,0,0,0]); - t[7] = #VPBLEND_8u32(t[7], t[3], (8u1)[0,0,1,1,0,0,0,0]); - state[2] = #VPBLEND_8u32(state[2], t[3], (8u1)[1,1,0,0,0,0,0,0]); - t[7] = #VPBLEND_8u32(t[7], t[5], (8u1)[1,1,0,0,0,0,0,0]); - state[2] = !state[2] & t[7]; - state[2] = state[2] ^ t[2]; - t[0] = #VPERMQ(t[0], (4u2)[0,0,0,0]); - state[3] = #VPERMQ(state[3], (4u2)[0,1,2,3]); - state[5] = #VPERMQ(state[5], (4u2)[2,0,3,1]); - state[6] = #VPERMQ(state[6], (4u2)[1,3,0,2]); - state[4] = #VPBLEND_8u32(t[6], t[3], (8u1)[0,0,0,0,1,1,0,0]); - t[7] = #VPBLEND_8u32(t[5], t[6], (8u1)[0,0,0,0,1,1,0,0]); - state[4] = #VPBLEND_8u32(state[4], t[5], (8u1)[0,0,1,1,0,0,0,0]); - t[7] = #VPBLEND_8u32(t[7], t[2], (8u1)[0,0,1,1,0,0,0,0]); - state[4] = #VPBLEND_8u32(state[4], t[2], (8u1)[1,1,0,0,0,0,0,0]); - t[7] = #VPBLEND_8u32(t[7], t[3], (8u1)[1,1,0,0,0,0,0,0]); - state[4] = !state[4] & t[7]; - state[0] = state[0] ^ t[0]; - state[1] = state[1] ^ t[1]; - state[4] = state[4] ^ t[4]; - - //######################################## Iota - state[0] = state[0] ^ iotas_p.[(int) iotas_o]; - iotas_o += 32; - - _,_,_,zf,r = #DEC_64(r); - }(!zf) - - return state; -} - -/* -export fn testF(reg mut ptr u256[7] stm) -> reg ptr u256[7] -{ - reg u256[7] st; - inline int i; - for i = 0 to 7 { st[i] = stm[i]; } - st = _keccakf1600_avx2(st); - for i = 0 to 7 { stm[i] = st[i]; } - return stm; -} -*/ - -inline -fn _keccakf1600_avx2_(reg u256[7] state) -> reg u256[7] -{ - inline int i; - reg u256[7] st; - - for i = 0 to 7 { st[i] = state[i]; } - - st = _keccakf1600_avx2(st); - - for i = 0 to 7 { state[i] = st[i]; } - - return state; -} - - -// converts a (plain) keccak state (st25) into the avx2 representation -inline fn __stavx2_pack -( reg const ptr u64[25] st -) -> reg u256[7] { - // 3*r256 (evitáveis...) - reg u256[7] state; - reg u256 t256_0 t256_1 t256_2; - reg u128 t128_0, t128_1; - reg u64 r; - - // [ 0 0 0 0 ] - state[0] = #VPBROADCAST_4u64(st.[u64 8*0]); - // [ 1 2 3 4 ] - state[1] = st.[u256 1*8]; - // [ 5 - ] - t128_0 = #VMOV(st[5]); - // [ 6 7 8 9 ] - state[3] = st.[u256 6*8]; - // [ 10 - ] - t128_1 = #VMOV(st[10]); - // [ 11 12 13 14 ] - state[4] = st.[u256 11*8]; - // [ 5 15 ] - r = st[15]; - t128_0 = #VPINSR_2u64(t128_0, r, 1); - // [ 16 17 18 19 ] - state[5] = st.[u256 16*8]; - // [ 10 20 ] - r = st[20]; - t128_1 = #VPINSR_2u64(t128_1, r, 1); - // alternative not currently supported: VPGATHERDQ for filling state[2] - // [ 10 20 5 15 ] - state[2] = (2u128)[t128_0, t128_1]; - // [ 21 22 23 24 ] - state[6] = st.[u256 21*8]; - - // [ 16 7 8 19 ] - t256_0 = #VPBLEND_8u32(state[3], state[5], (8u1)[1,1,0,0,0,0,1,1]); - // [ 11 22 23 14 ] - t256_1 = #VPBLEND_8u32(state[6], state[4], (8u1)[1,1,0,0,0,0,1,1]); - // [ 6 12 13 9 ] - t256_2 = #VPBLEND_8u32(state[4], state[3], (8u1)[1,1,0,0,0,0,1,1]); - - // [ 16 7 23 14 ] - state[3] = #VPBLEND_8u32(t256_0, t256_1, (8u1)[1,1,1,1,0,0,0,0]); - // [ 11 22 8 19 ] - state[4] = #VPBLEND_8u32(t256_1, t256_0, (8u1)[1,1,1,1,0,0,0,0]); - - // [ 21 17 18 24 ] - t256_0 = #VPBLEND_8u32(state[5], state[6], (8u1)[1,1,0,0,0,0,1,1]); - - // [ 21 17 13 9 ] - state[5] = #VPBLEND_8u32(t256_0, t256_2, (8u1)[1,1,1,1,0,0,0,0]); - // [ 6 12 18 24 ] - state[6] = #VPBLEND_8u32(t256_2, t256_0, (8u1)[1,1,1,1,0,0,0,0]); - - // [ 0 0 0 0 ] - // [ 1 2 3 4 ] - // [ 10 20 5 15 ] - // [ 16 7 23 14 ] - // [ 11 22 8 19 ] - // [ 21 17 13 9 ] - // [ 6 12 18 24 ] - return state; -} - -// recovers a (plain) keccak state (st25) from an avx2-encoded one -inline fn __stavx2_unpack -( reg mut ptr u64[25] st -, reg u256[7] state -) -> reg ptr u64[25] { - // 5*r256 + 2*r128(evitáveis) (+7*r256) - reg u256 t256_0 t256_1 t256_2 t256_3 t256_4; - reg u128 t128_0, t128_1; - - // [ 0, 0 ] - t128_0 = (128u) state[0]; - st[0] = #VMOVLPD(t128_0); - // [ 1, 2, 3, 4 ] - st.[u256 1*8] = state[1]; - - // [ 16, 7, 8, 19 ] - t256_0 = #VPBLEND_8u32(state[3], state[4], (8u1)[1,1,1,1,0,0,0,0]); - // [ 11, 22, 23, 14 ] - t256_1 = #VPBLEND_8u32(state[4], state[3], (8u1)[1,1,1,1,0,0,0,0]); - // [ 21, 17, 18, 24 ] - t256_2 = #VPBLEND_8u32(state[5], state[6], (8u1)[1,1,1,1,0,0,0,0]); - // [ 6, 12, 13, 9 ] - t256_3 = #VPBLEND_8u32(state[6], state[5], (8u1)[1,1,1,1,0,0,0,0]); - - // [ 5, 15 ] -// state[2] = TTT[0]; - t128_1 = #VEXTRACTI128(state[2], 1); - st[5] = #VMOVLPD(t128_1); - - // [ 6, 7, 8, 9 ] - t256_4 = #VPBLEND_8u32(t256_0, t256_3, (8u1)[1,1,0,0,0,0,1,1]); - st.[u256 6*8] = t256_4; - - // [ 10, 20 ] - t128_0 = (128u) state[2]; - st[10] = #VMOVLPD(t128_0); - - // [ 11, 12, 13, 14 ] - t256_4 = #VPBLEND_8u32(t256_3, t256_1, (8u1)[1,1,0,0,0,0,1,1]); - st.[u256 11*8] = t256_4; - - // [ 15 ] - st[15] = #VMOVHPD(t128_1); - - // [ 16, 17, 18, 19 ] - t256_4 = #VPBLEND_8u32(t256_2, t256_0, (8u1)[1,1,0,0,0,0,1,1]); - st.[u256 16*8] = t256_4; - - // [ 20 ] - st[20] = #VMOVHPD(t128_0); - - // [ 21, 22, 23, 24 ] - t256_4 = #VPBLEND_8u32(t256_1, t256_2, (8u1)[1,1,0,0,0,0,1,1]); - st.[u256 21*8] = t256_4; - - return st; -} - diff --git a/code/jasmin/mlkem_avx2/keccak/keccakf1600_globals.jinc b/code/jasmin/mlkem_avx2/keccak/keccakf1600_globals.jinc deleted file mode 100644 index ccdcbb26..00000000 --- a/code/jasmin/mlkem_avx2/keccak/keccakf1600_globals.jinc +++ /dev/null @@ -1,36 +0,0 @@ -param int KECCAK_ROUNDS = 24; - -inline fn keccakf1600_index(inline int x y) -> inline int -{ - inline int r; - r = (x % 5) + 5 * (y % 5); - return r; -} - - -inline fn keccakf1600_rho_offsets(inline int i) -> inline int -{ - inline int r x y z t; - - r = 0; - x = 1; - y = 0; - - for t = 0 to 24 - { if (i == x + 5 * y) - { r = ((t + 1) * (t + 2) / 2) % 64; } - z = (2 * x + 3 * y) % 5; - x = y; - y = z; - } - - return r; -} - -inline fn keccakf1600_rhotates(inline int x y) -> inline int -{ - inline int i r; - i = keccakf1600_index(x, y); - r = keccakf1600_rho_offsets(i); - return r; -} diff --git a/code/jasmin/mlkem_avx2/keccak/keccakf1600x4_avx2.jinc b/code/jasmin/mlkem_avx2/keccak/keccakf1600x4_avx2.jinc deleted file mode 100644 index cde4d741..00000000 --- a/code/jasmin/mlkem_avx2/keccak/keccakf1600x4_avx2.jinc +++ /dev/null @@ -1,333 +0,0 @@ - -require "keccakf1600_globals.jinc" - -u256[24] KECCAK1600_RC_AVX2 = -{ (4u64)[0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001], - (4u64)[0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082], - (4u64)[0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a], - (4u64)[0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000], - (4u64)[0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b], - (4u64)[0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001], - (4u64)[0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081], - (4u64)[0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009], - (4u64)[0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a], - (4u64)[0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088], - (4u64)[0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009], - (4u64)[0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a], - (4u64)[0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b], - (4u64)[0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b], - (4u64)[0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089], - (4u64)[0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003], - (4u64)[0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002], - (4u64)[0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080], - (4u64)[0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a], - (4u64)[0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a], - (4u64)[0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081], - (4u64)[0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080], - (4u64)[0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001], - (4u64)[0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008] -}; - -u256 ROL56 = 0x181F1E1D1C1B1A191017161514131211080F0E0D0C0B0A090007060504030201; -u256 ROL8 = 0x1E1D1C1B1A19181F16151413121110170E0D0C0B0A09080F0605040302010007; - -// C[x] = A[x,0] ^ A[x,1] ^ A[x,2] ^ A[x,3] ^ A[x,4] -inline fn keccakf1600_4x_theta_sum(reg ptr u256[25] a) -> reg u256[5] -{ - inline int x y; - reg u256[5] c; - - // C[x] = A[x, 0] - for x=0 to 5 - { c[x] = a[x + 0]; } - - // C[x] ^= A[x,1] ^ A[x,2] ^ A[x,3] ^ A[x,4] - for y=1 to 5 - { for x=0 to 5 - { c[x] ^= a[x + y*5]; } - } - - return c; -} - -inline fn keccakf1600_4x_rol(reg u256[5] a, inline int x r, reg u256 r8 r56) -> reg u256[5] -{ - reg u256 t; - - if(r == 8) - { a[x] = #VPSHUFB_256(a[x], r8); } - else { if(r == 56) - { a[x] = #VPSHUFB_256(a[x], r56); } - else - { t = #VPSLL_4u64(a[x], r); - a[x] = #VPSRL_4u64(a[x], 64 - r); - a[x] |= t; } - } - - return a; -} - -// D[x] = C[x-1] ^ ROT(C[x+1], 1) -inline fn keccakf1600_4x_theta_rol(reg u256[5] c, reg u256 r8 r56) -> reg u256[5] -{ - inline int x; - reg u256[5] d; - - for x = 0 to 5 - { // D[x] = C[x + 1] - d[x] = c[(x + 1) % 5]; - - // D[x] = ROT(D[x], 1) - d = keccakf1600_4x_rol(d, x, 1, r8, r56); - - // D[x] ^= C[x-1] - d[x] ^= c[(x - 1 + 5) % 5]; - } - - return d; -} - - -// B[x] = ROT( (A[x',y'] ^ D[x']), r[x',y'] ) with (x',y') = M^-1 (x,y) -// -// M = (0 1) M^-1 = (1 3) x' = 1x + 3y -// (2 3) (1 0) y' = 1x + 0y -// -inline fn keccakf1600_4x_rol_sum( - reg ptr u256[25] a, - reg u256[5] d, - inline int y, - reg u256 r8 r56 -) -> reg u256[5] -{ - inline int r x x_ y_; - reg u256[5] b; - - for x = 0 to 5 - { - x_ = (x + 3*y) % 5; - y_ = x; - r = keccakf1600_rhotates(x_, y_); - - // B[x] = A[x',y'] - b[x] = a[x_ + y_*5]; - - // B[x] ^= D[x']; - b[x] ^= d[x_]; - - // B[x] = ROT( B[x], r[x',y'] ); - if(r != 0) - { b = keccakf1600_4x_rol(b, x, r, r8, r56); } - } - - return b; -} - - -// E[x, y] = B[x] ^ ( (!B[x+1]) & B[x+2] ) -// -- when x and y are 0: E[0,0] ^= RC[i]; -inline fn keccakf1600_4x_set_row( - reg ptr u256[25] e, - reg u256[5] b, - inline int y, - reg u256 rc -) -> reg ptr u256[25] -{ - inline int x x1 x2; - reg u256 t; - - for x=0 to 5 - { - x1 = (x + 1) % 5; - x2 = (x + 2) % 5; - - t = #VPANDN_256(b[x1], b[x2]); - - t ^= b[x]; - if( x==0 && y==0 ){ t ^= rc; } - e[x + y*5] = t; - } - - return e; -} - - -fn keccakf1600_4x_round(reg ptr u256[25] e a, reg u256 rc r8 r56) -> reg ptr u256[25] -{ - inline int y; - reg u256[5] b c d; - - c = keccakf1600_4x_theta_sum(a); - d = keccakf1600_4x_theta_rol(c, r8, r56); - - for y = 0 to 5 - { b = keccakf1600_4x_rol_sum(a, d, y, r8, r56); - e = keccakf1600_4x_set_row(e, b, y, rc); - } - - return e; -} - -//////////////////////////////////////////////////////////////////////////////// - -inline fn __keccakf1600_avx2x4(reg ptr u256[25] a) -> reg ptr u256[25] -{ - #mmx reg ptr u256[25] a_s; - - reg ptr u256[24] RC; - - stack u256[25] s_e; - reg ptr u256[25] e; - - reg u256 rc r8 r56; - reg u64 c; - - RC = KECCAK1600_RC_AVX2; - e = s_e; - r8 = ROL8; - r56 = ROL56; - - c = 0; - while(c < (KECCAK_ROUNDS*32)) - { - rc = RC.[(int) c]; - e = keccakf1600_4x_round(e, a, rc, r8, r56); - - // just an expensive pointer swap (#todo request feature) - a_s = a; s_e = e; - a = a_s; e = s_e; - - rc = RC.[(int) c + 32]; - a = keccakf1600_4x_round(a, e, rc, r8, r56); - - // just an expensive pointer swap (#todo request feature) - a_s = a; s_e = e; - a = a_s; e = s_e; - - c += 64; - } - - return a; -} - -fn _keccakf1600_avx2x4(reg ptr u256[25] a) -> reg ptr u256[25] -{ - a = __keccakf1600_avx2x4(a); - return a; -} - -inline fn _keccakf1600_avx2x4_(reg ptr u256[25] a) -> reg ptr u256[25] -{ - a = a; - a = _keccakf1600_avx2x4(a); - a = a; - return a; -} - -/* -// pack 4 keccak states (st25) into a 4-way state (st4x) -inline fn __u256x4_4u64x4 -( reg u256 x0 x1 x2 x3 -) -> reg u256, reg u256, reg u256, reg u256 { - // x0 = l00 l01 l02 l03 - // x1 = l10 l11 l12 l13 - // x2 = l20 l21 l22 l23 - // x3 = l30 l31 l32 l33 - reg u256 y0, y1, y2, y3; - y0 = #VPUNPCKL_4u64(x0, x1); // y0 = l00 l10 l02 l12 - y1 = #VPUNPCKH_4u64(x0, x1); // y1 = l01 l11 l03 l13 - y2 = #VPUNPCKL_4u64(x2, x3); // y2 = l20 l30 l22 l32 - y3 = #VPUNPCKH_4u64(x2, x3); // y3 = l21 l31 l23 l33 - - x0 = #VPERM2I128(y0, y2, 0x20); // x0 = l00 l10 l20 l30 - x1 = #VPERM2I128(y1, y3, 0x20); // x1 = l01 l11 l21 l31 - x2 = #VPERM2I128(y0, y2, 0x31); // x2 = l02 l12 l22 l32 - x3 = #VPERM2I128(y1, y3, 0x31); // x3 = l03 l13 l23 l33 - - return x0, x1, x2, x3; -} - -inline fn __st4x_pack -( reg mut ptr u256[25] st4x -, reg const ptr u64[25] st0 st1 st2 st3 -) -> reg ptr u256[25] { - inline int i; - reg u256 x0, x1, x2, x3; - reg u64 t0, t1, t2, t3; - for i = 0 to 6 { - x0 = st0[u256 i]; - x1 = st1[u256 i]; - x2 = st2[u256 i]; - x3 = st3[u256 i]; - x0, x1, x2, x3 = __u256x4_4u64x4(x0, x1, x2, x3); - st4x[4*i+0] = x0; - st4x[4*i+1] = x1; - st4x[4*i+2] = x2; - st4x[4*i+3] = x3; - } - t0 = st0[24]; - t1 = st1[24]; - t2 = st2[24]; - t3 = st3[24]; - st4x[u64 4*24+0] = t0; - st4x[u64 4*24+1] = t1; - st4x[u64 4*24+2] = t2; - st4x[u64 4*24+3] = t3; - - return st4x; -} - - - -// extracts 4 keccak states (st25) from a 4-way state (st4x) -inline fn __4u64x4_u256x4 -( reg u256 y0 y1 y2 y3 -) -> reg u256, reg u256, reg u256, reg u256 { - // y0 = l00 l10 l20 l30 - // y1 = l01 l11 l21 l31 - // y2 = l02 l12 l22 l32 - // y3 = l03 l13 l23 l33 - reg u256 x0, x1, x2, x3; - x0 = #VPERM2I128(y0, y2, 0x20); // x0 = l00 l10 l02 l12 - x1 = #VPERM2I128(y1, y3, 0x20); // x1 = l01 l11 l03 l13 - x2 = #VPERM2I128(y0, y2, 0x31); // x2 = l20 l30 l22 l32 - x3 = #VPERM2I128(y1, y3, 0x31); // x3 = l21 l31 l23 l33 - - y0 = #VPUNPCKL_4u64(x0, x1); // y0 = l00 l01 l02 l03 - y1 = #VPUNPCKH_4u64(x0, x1); // y1 = l10 l11 l12 l13 - y2 = #VPUNPCKL_4u64(x2, x3); // y2 = l20 l21 l22 l23 - y3 = #VPUNPCKH_4u64(x2, x3); // y3 = l30 l31 l32 l33 - - return y0, y1, y2, y3; -} - -inline fn __st4x_unpack -( reg mut ptr u64[25] st0 st1 st2 st3 -, reg const ptr u256[25] st4x -) -> reg ptr u64[25], reg ptr u64[25], reg ptr u64[25], reg ptr u64[25] { - inline int i; - reg u256 x0, x1, x2, x3; - reg u64 t0, t1, t2, t3; - for i = 0 to 6 { - x0 = st4x[u256 4*i+0]; - x1 = st4x[u256 4*i+1]; - x2 = st4x[u256 4*i+2]; - x3 = st4x[u256 4*i+3]; - x0, x1, x2, x3 = __4u64x4_u256x4(x0, x1, x2, x3); - st0.[u256 4*8*i] = x0; - st1.[u256 4*8*i] = x1; - st2.[u256 4*8*i] = x2; - st3.[u256 4*8*i] = x3; - } - t0 = st4x[u64 4*24+0]; - t1 = st4x[u64 4*24+1]; - t2 = st4x[u64 4*24+2]; - t3 = st4x[u64 4*24+3]; - st0.[u64 8*24] = t0; - st1.[u64 8*24] = t1; - st2.[u64 8*24] = t2; - st3.[u64 8*24] = t3; - - return st0, st1, st2, st3; -} -*/ diff --git a/code/jasmin/mlkem_avx2/keccak/subreadwrite_array_ASIZE.jinc b/code/jasmin/mlkem_avx2/keccak/subreadwrite_array_ASIZE.jinc deleted file mode 100644 index d33937e2..00000000 --- a/code/jasmin/mlkem_avx2/keccak/subreadwrite_array_ASIZE.jinc +++ /dev/null @@ -1,261 +0,0 @@ -/** - READ A FIXED NUMBER OF BYTES INTO A WORD -**/ - -inline fn __aread_subu64 -( reg const ptr u8[ASIZE] buf -, reg u64 offset -, inline int DELTA -, inline int LEN -, inline int TRAIL -) -> inline int /* DELTA */ - , inline int /* LEN */ - , inline int /* TRAIL */ - , reg u64 /* w */ -{ - reg u64 w, t16, t8; - inline int ILEN; - ILEN = LEN; - if (LEN <=s 0) { - w = TRAIL; - TRAIL = 0; - } else if (8 <=s LEN) { - w = buf.[u64 offset + DELTA]; - DELTA += 8; - LEN -= 8; - } else { - if (4 <=s LEN) { - w = (64u) buf.[u32 offset + DELTA]; - DELTA += 4; - LEN -= 4; - } else { - w = 0; - } - if (2 <=s LEN) { - t16 = (64u) buf.[u16 offset + DELTA]; - DELTA += 2; - LEN -= 2; - } else { - t16 = 0; - } - if (1 <=s LEN || TRAIL != 0) { - if (1 <=s LEN) { - t8 = (64u) buf.[u8 offset + DELTA]; - if (TRAIL != 0) { t8 |= 256*TRAIL; } - DELTA += 1; - LEN -= 1; - } else { - t8 = TRAIL; - } - TRAIL = 0; - t8 <<= 8*(2*((ILEN/2) % 2)); - t16 |= t8; - } - t16 <<= 8*(4*((ILEN/4) % 2)); - w |= t16; - } - return DELTA, LEN, TRAIL, w; -} - -inline fn __aread_bcast_4subu64 -( reg const ptr u8[ASIZE] buf -, reg u64 offset -, inline int DELTA -, inline int LEN -, inline int TRAIL -) -> inline int /* DELTA */ - , inline int /* LEN */ - , inline int /* TRAIL */ - , reg u256 /* w */ -{ - reg u64 t64; - reg u128 t128; - reg u256 w; - if (LEN <=s 0 && TRAIL==0) { - w = #set0_256(); - } else { - if (8 <= LEN) { - w = #VPBROADCAST_4u64(buf.[u64 offset + DELTA]); - DELTA += 8; - LEN -= 8; - } else { - DELTA, LEN, TRAIL, t64 = __aread_subu64(buf, offset, DELTA, LEN, TRAIL); - t128 = (128u) t64; - w = #VPBROADCAST_4u64(t128); - } - } - return DELTA, LEN, TRAIL, w; -} - -inline fn __aread_subu128 -( reg const ptr u8[ASIZE] buf -, reg u64 offset -, inline int DELTA -, inline int LEN -, inline int TRAIL -) -> inline int /* DELTA */ - , inline int /* LEN */ - , inline int /* TRAIL */ - , reg u128 /* w */ -{ - reg u128 w; - reg u64 t64; - if (LEN <=s 0 && TRAIL==0) { - w = #set0_128(); - } else if (16 <=s LEN) { - w = buf.[u128 offset + DELTA]; - DELTA += 16; - LEN -= 16; - } else { - if (8 <=s LEN) { - w = #VMOV(buf.[u64 offset + DELTA]); - DELTA += 8; - LEN -= 8; - DELTA, LEN, TRAIL, t64 = __aread_subu64(buf, offset, DELTA, LEN, TRAIL); - w = #VPINSR_2u64(w, t64, 1); - } else { - DELTA, LEN, TRAIL, t64 = __aread_subu64(buf, offset, DELTA, LEN, TRAIL); - w = (128u) t64; - } - } - return DELTA, LEN, TRAIL, w; -} - -inline fn __aread_subu256 -( reg const ptr u8[ASIZE] buf -, reg u64 offset -, inline int DELTA -, inline int LEN -, inline int TRAIL -) -> inline int /* DELTA */ - , inline int /* LEN */ - , inline int /* TRAIL */ - , reg u256 /* w */ -{ - reg u256 w; - reg u128 t128_0, t128_1; - if (LEN <=s 0 && TRAIL==0) { - w = #set0_256(); - } else if (32 <=s LEN) { - w = buf.[u256 offset + DELTA]; - DELTA += 32; - LEN -= 32; - } else { - if (16 <=s LEN) { - t128_0 = buf.[u128 offset + DELTA]; - DELTA += 16; - LEN -= 16; - DELTA, LEN, TRAIL, t128_1 = __aread_subu128(buf, offset, DELTA, LEN, TRAIL); - w = (2u128)[t128_1, t128_0]; - } else { - t128_1 = #set0_128(); - DELTA, LEN, TRAIL, t128_0 = __aread_subu128(buf, offset, DELTA, LEN, TRAIL); - w = (2u128)[t128_1, t128_0]; - } - } - return DELTA, LEN, TRAIL, w; -} - - -/** - WRITE A FIXED NUMBER OF BYTES FROM A WORD -**/ -inline fn __awrite_subu64 -( reg mut ptr u8[ASIZE] buf -, reg u64 offset -, inline int DELTA -, inline int LEN -, reg u64 w -) -> reg ptr u8[ASIZE] /* buf */ - , inline int /* DELTA */ - , inline int /* LEN */ -{ - if (0 >= 32; - DELTA += 4; - LEN -= 4; - } - if (2 <=s LEN) { - buf.[u16 offset + DELTA] = (16u) w; - w >>= 16; - DELTA += 2; - LEN -= 2; - } - if (1 <=s LEN) { - buf.[u8 offset + DELTA] = (8u) w; - DELTA += 1; - LEN -= 1; - } - } - } - return buf, DELTA, LEN; -} - -inline fn __awrite_subu128 -( reg mut ptr u8[ASIZE] buf -, reg u64 offset -, inline int DELTA -, inline int LEN -, reg u128 w -) -> reg ptr u8[ASIZE] /* buf */ - , inline int /* DELTA */ - , inline int /* LEN */ -{ - reg u64 t64; - if (0 reg ptr u8[ASIZE] /* buf */ - , inline int /* DELTA */ - , inline int /* LEN */ -{ - reg u128 t128; - if (0 reg u64 /* buf */ - , inline int /* LEN */ - , inline int /* TRAIL */ - , reg u64 /* w */ -{ - reg u64 w, t16, t8; - inline int ILEN; - ILEN = LEN; - if (LEN <=s 0) { - w = TRAIL; - TRAIL = 0; - } else if (8 <=s LEN) { - w = (u64)[buf]; - buf += 8; - LEN -= 8; - } else { - if (4 <=s LEN) { - w = (64u) (u32)[buf]; - buf += 4; - LEN -= 4; - } else { - w = 0; - } - if (2 <=s LEN) { - t16 = (64u) (u16)[buf]; - buf += 2; - LEN -= 2; - } else { - t16 = 0; - } - if (1 <=s LEN || TRAIL != 0) { - if (1 <=s LEN) { - t8 = (64u) (u8)[buf]; - if (TRAIL != 0) { t8 |= 256*TRAIL; } - buf += 1; - LEN -= 1; - } else { - t8 = TRAIL; - } - TRAIL = 0; - t8 <<= 8*(2*((ILEN/2) % 2)); - t16 |= t8; - } - t16 <<= 8*(4*((ILEN/4) % 2)); - w |= t16; - } - return buf, LEN, TRAIL, w; -} - -inline fn __mread_bcast_4subu64 -( reg u64 buf -, inline int LEN -, inline int TRAIL -) -> reg u64 /* buf */ - , inline int /* LEN */ - , inline int /* TRAIL */ - , reg u256 /* w */ -{ - reg u64 t64; - reg u128 t128; - reg u256 w; - if (LEN <=s 0 && TRAIL==0) { - w = #set0_256(); - } else { - if (8 <= LEN) { - w = #VPBROADCAST_4u64((u64)[buf]); - buf += 8; - LEN -= 8; - } else { - buf, LEN, TRAIL, t64 = __mread_subu64(buf, LEN, TRAIL); - t128 = (128u) t64; - w = #VPBROADCAST_4u64(t128); - } - } - return buf, LEN, TRAIL, w; -} - -inline fn __mread_subu128 -( reg u64 buf -, inline int LEN -, inline int TRAIL -) -> reg u64 /* buf */ - , inline int /* LEN */ - , inline int /* TRAIL */ - , reg u128 /* w */ -{ - reg u128 w; - reg u64 t64; - if (LEN <=s 0 && TRAIL==0) { - w = #set0_128(); - } else if (16 <=s LEN) { - w = (u128) [buf]; - buf += 16; - LEN -= 16; - } else { - if (8 <=s LEN) { - w = #VMOV((u64)[buf]); - buf += 8; - LEN -= 8; - buf, LEN, TRAIL, t64 = __mread_subu64(buf, LEN, TRAIL); - w = #VPINSR_2u64(w, t64, 1); - } else { - buf, LEN, TRAIL, t64 = __mread_subu64(buf, LEN, TRAIL); - w = (128u) t64; - } - } - return buf, LEN, TRAIL, w; -} - -inline fn __mread_subu256 -( reg u64 buf -, inline int LEN -, inline int TRAIL -) -> reg u64 /* buf */ - , inline int /* LEN */ - , inline int /* TRAIL */ - , reg u256 /* w */ -{ - reg u256 w; - reg u128 t128_0, t128_1; - if (LEN <=s 0 && TRAIL==0) { - w = #set0_256(); - } else if (32 <=s LEN) { - w = (u256)[buf]; - buf += 32; - LEN -= 32; - } else { - if (16 <=s LEN) { - t128_0 = (u128) [buf]; - buf += 16; - LEN -= 16; - buf, LEN, TRAIL, t128_1 = __mread_subu128(buf, LEN, TRAIL); - w = (2u128)[t128_1, t128_0]; - } else { - t128_1 = #set0_128(); - buf, LEN, TRAIL, t128_0 = __mread_subu128(buf, LEN, TRAIL); - w = (2u128)[t128_1, t128_0]; - } - } - return buf, LEN, TRAIL, w; -} - - -/** - WRITE A FIXED NUMBER OF BYTES FROM A WORD -**/ -inline fn __mwrite_subu64 -( reg u64 buf -, inline int LEN -, reg u64 w -) -> reg u64 /* buf */ - , inline int /* LEN */ -{ - if (0 >= 32; - buf += 4; - LEN -= 4; - } - if (2 <=s LEN) { - (u16)[buf] = (16u) w; - w >>= 16; - buf += 2; - LEN -= 2; - } - if (1 <=s LEN) { - (u8)[buf] = (8u) w; - buf += 1; - LEN -= 1; - } - } - } - return buf, LEN; -} - -inline fn __mwrite_subu128 -( reg u64 buf -, inline int LEN -, reg u128 w -) -> reg u64 /* buf */ - , inline int /* LEN */ -{ - reg u64 t64; - if (0 reg u64 /* buf */ - , inline int /* LEN */ -{ - reg u128 t128; - if (0 reg ptr u64[25] -{ - inline int i; - - for i = 0 to 25 { - state[i] = 0; - } - - return state; -} - - -inline -fn __add_full_block( - stack u64[25] state, - reg u64 in, - reg u64 inlen, - reg u64 r8 -) -> stack u64[25], reg u64, reg u64 -{ - reg u64 i t r64; - - r64 = r8; - r64 >>= 3; - i = 0; - while (i < r64) - { - t = [in + 8 * i]; - state[(int) i] ^= t; - i = i + 1; - } - - in += r8; - inlen -= r8; - - return state, in, inlen; -} - - -inline -fn __add_final_block( - stack u64[25] state, - reg u64 in, - reg u64 inlen, - reg u8 trail_byte, - reg u64 r8 -) -> stack u64[25] -{ - reg u64 i, t, inlen8; - reg u8 c; - - inlen8 = inlen; - inlen8 >>= 3; - i = 0; - while ( i < inlen8) - { - t = [in + 8*i]; - state[(int) i] ^= t; - i = i + 1; - } - - i <<= 3; - while (i < inlen) - { - c = (u8)[in + i]; - state[u8 (int) i] ^= c; - i = i + 1; - } - - state[u8 (int) i] ^= trail_byte; - - i = r8; - i -= 1; - state[u8 (int) i] ^= 0x80; - - return state; -} - -fn _isha3_256( - #spill_to_mmx reg ptr u8[32] out, - #spill_to_mmx reg u64 in inlen) - -> - reg ptr u8[32] -{ - stack u64[25] state; - #spill_to_mmx reg u64 ilen r8 t64; - reg u8 t8; - inline int i; - - () = #spill(out); - - state = __st0(state); - - r8 = SHA3_256_RATE; - ilen = inlen; - - while(ilen >= r8) - { - state, in, ilen = __add_full_block(state, in, ilen, r8); - - () = #spill(in, ilen, r8); - - state = _keccakf1600_(state); - - () = #unspill(in, ilen, r8); - } - - t8 = 0x06; - state = __add_final_block(state, in, ilen, t8, r8); - - state = _keccakf1600_(state); - - () = #unspill(out); - - for i=0 to 4 - { t64 = state[i]; - out[u64 i] = t64; - } - - return out; -} - - -fn _shake256_1120_32(#spill_to_mmx reg u64 out in0 in1) -{ - stack u64[25] state; - #spill_to_mmx reg u64 ilen r8 t64; - reg u8 t8; - inline int i; - - () = #spill(out); - - state = __st0(state); - - for i = 0 to MLKEM_SYMBYTES/8 { - t64 = (u64)[in0 + i*8]; - state[u64 i] ^= t64; - } - - for i = MLKEM_SYMBYTES/8 to SHAKE256_RATE/8 { - t64 = (u64)[in1 + (i-MLKEM_SYMBYTES/8)*8]; - state[u64 i] ^= t64; - } - - () = #spill(in1); - - state = _keccakf1600_(state); - - () = #unspill(in1); - - r8 = SHAKE256_RATE; - ilen = MLKEM_INDCPA_CIPHERTEXTBYTES - (SHAKE256_RATE - MLKEM_SYMBYTES); - in1 += SHAKE256_RATE - MLKEM_SYMBYTES; - - while(ilen >= r8) - { - state, in1, ilen = __add_full_block(state, in1, ilen, r8); - - () = #spill(in1, ilen, r8); - - state = _keccakf1600_(state); - - () = #unspill(in1, ilen, r8); - } - - t8 = 0x1f; - state = __add_final_block(state, in1, ilen, t8, r8); - - state = _keccakf1600_(state); - - () = #unspill(out); - - for i=0 to MLKEM_SYMBYTES/8 - { - t64 = state[i]; - (u64)[out + 8*i] = t64; - } - -} - -fn _shake256_128_33( - #spill_to_mmx reg ptr u8[128] out, - reg const ptr u8[33] in) - -> - stack u8[128] -{ - stack u64[25] state; - reg u64 t64; - reg u8 c; - inline int i; - - () = #spill(out); - - state = __st0(state); - - for i = 0 to 4 { - t64 = in[u64 i]; - state[u64 i] ^= t64; - } - - c = in[32]; - state[u8 32] ^= c; - state[u8 33] ^= 0x1f; - state[u8 SHAKE256_RATE-1] ^= 0x80; - - state = _keccakf1600_(state); - - () = #spill(out); - - for i = 0 to 16 { - t64 = state[u64 i]; - out[u64 i] = t64; - } - - return out; -} - -fn _isha3_256_32( - #spill_to_mmx reg ptr u8[32] out, - reg ptr u8[MLKEM_SYMBYTES] in) - -> - reg ptr u8[32] -{ - stack u64[25] state; - reg u64 t64; - inline int i; - - () = #spill(out); - - state = __st0(state); - - for i=0 to MLKEM_SYMBYTES/8 - { - t64 = in[u64 i]; - state[u64 i] = t64; - } - - state[u8 MLKEM_SYMBYTES] ^= 0x06; - state[u8 SHA3_256_RATE - 1] = 0x80; - - state = _keccakf1600_(state); - - () = #unspill(out); - - for i=0 to 4 - { - t64 = state[i]; - out[u64 i] = t64; - } - - return out; -} - -fn _sha3_512_64( - #spill_to_mmx reg ptr u8[64] out, - reg const ptr u8[64] in) - -> - reg ptr u8[64] -{ - stack u64[25] state; - reg u64 t64; - inline int i; - - () = #spill(out); - - state = __st0(state); - - for i = 0 to 8 - { - t64 = in[u64 i]; - state[i] ^= t64; - } - - state[u8 64] ^= 0x06; - state[u8 SHA3_512_RATE - 1] ^= 0x80; - - state = _keccakf1600_(state); - - () = #unspill(out); - - for i = 0 to 8 - { - t64 = state[i]; - out[u64 i] = t64; - } - - return out; -} - -fn _sha3_512_32( - #spill_to_mmx reg ptr u8[64] out, - reg const ptr u8[32] in) - -> - reg ptr u8[64] -{ - stack u64[25] state; - reg u64 t64; - inline int i; - - () = #spill(out); - - state = __st0(state); - - for i = 0 to 4 - { - t64 = in[u64 i]; - state[i] ^= t64; - } - - state[u8 32] ^= 0x06; - state[u8 SHA3_512_RATE-1] ^= 0x80; - - state = _keccakf1600_(state); - - () = #unspill(out); - - for i = 0 to 8 { - t64 = state[i]; - out[u64 i] = t64; - } - - return out; -} - -fn _shake128_absorb34(reg ptr u64[25] state, reg const ptr u8[34] in) -> reg ptr u64[25] -{ - reg u64 t64; - reg u16 t16; - inline int i; - - state = __st0(state); - - for i = 0 to 4 - { - t64 = in[u64 i]; - state[u64 i] ^= t64; - } - - t16 = in.[u16 32]; - state[u16 16] ^= t16; - - state[u8 34] ^= 0x1f; - - state[u8 SHAKE128_RATE-1] ^= 0x80; - - return state; -} - -fn _shake128_squeezeblock( - reg ptr u64[25] state, - #spill_to_mmx reg ptr u8[SHAKE128_RATE] out) - -> - reg ptr u64[25], - reg ptr u8[SHAKE128_RATE] -{ - reg u64 t; - inline int i; - - () = #spill(out); - - state = _keccakf1600_(state); - - () = #unspill(out); - - for i = 0 to SHAKE128_RATE/8 - { - t = state[i]; - out[u64 i] = t; - } - return state, out; -} diff --git a/code/jasmin/mlkem_avx2/keccak_OLD/fips202_4x.jinc b/code/jasmin/mlkem_avx2/keccak_OLD/fips202_4x.jinc deleted file mode 100644 index c2a9a2f4..00000000 --- a/code/jasmin/mlkem_avx2/keccak_OLD/fips202_4x.jinc +++ /dev/null @@ -1,1456 +0,0 @@ -require "fips202_common.jinc" - -u256 rho56 = 0x181F1E1D1C1B1A191017161514131211080F0E0D0C0B0A090007060504030201; -u256 rho8 = 0x1E1D1C1B1A19181F16151413121110170E0D0C0B0A09080F0605040302010007; - -inline fn __rol_4u64_rho56(reg u256 a) -> reg u256 -{ - reg u256 r; - - r = #VPSHUFB_256(a, rho56); - - return r; -} - - -inline fn __rol_4u64_rho8(reg u256 a) -> reg u256 -{ - reg u256 r; - - r = #VPSHUFB_256(a, rho8); - - return r; -} - - -inline fn __rol_4u64(reg u256 a, inline int o) -> reg u256 -{ - reg u256 r; - reg u256 t256; - - r = #VPSLL_4u64(a, o); - t256 = #VPSRL_4u64(a, 64 - o); - - r |= t256; - - return r; -} - - -param int ba=0; -param int be=1; -param int bi=2; -param int bo=3; -param int bu=4; -param int ga=5; -param int ge=6; -param int gi=7; -param int go=8; -param int gu=9; -param int ka=10; -param int ke=11; -param int ki=12; -param int ko=13; -param int ku=14; -param int ma=15; -param int me=16; -param int mi=17; -param int mo=18; -param int mu=19; -param int sa=20; -param int se=21; -param int si=22; -param int so=23; -param int su=24; - -u256[24] KeccakF1600RoundConstants = { - 0x0000000000000001000000000000000100000000000000010000000000000001, - 0x0000000000008082000000000000808200000000000080820000000000008082, - 0x800000000000808a800000000000808a800000000000808a800000000000808a, - 0x8000000080008000800000008000800080000000800080008000000080008000, - 0x000000000000808b000000000000808b000000000000808b000000000000808b, - 0x0000000080000001000000008000000100000000800000010000000080000001, - 0x8000000080008081800000008000808180000000800080818000000080008081, - 0x8000000000008009800000000000800980000000000080098000000000008009, - 0x000000000000008a000000000000008a000000000000008a000000000000008a, - 0x0000000000000088000000000000008800000000000000880000000000000088, - 0x0000000080008009000000008000800900000000800080090000000080008009, - 0x000000008000000a000000008000000a000000008000000a000000008000000a, - 0x000000008000808b000000008000808b000000008000808b000000008000808b, - 0x800000000000008b800000000000008b800000000000008b800000000000008b, - 0x8000000000008089800000000000808980000000000080898000000000008089, - 0x8000000000008003800000000000800380000000000080038000000000008003, - 0x8000000000008002800000000000800280000000000080028000000000008002, - 0x8000000000000080800000000000008080000000000000808000000000000080, - 0x000000000000800a000000000000800a000000000000800a000000000000800a, - 0x800000008000000a800000008000000a800000008000000a800000008000000a, - 0x8000000080008081800000008000808180000000800080818000000080008081, - 0x8000000000008080800000000000808080000000000080808000000000008080, - 0x0000000080000001000000008000000100000000800000010000000080000001, - 0x8000000080008008800000008000800880000000800080088000000080008008 - }; - -inline fn __prepare_theta(reg ptr u256[25] A_4x) -> reg u256, reg u256, reg u256, reg u256, reg u256 -{ - reg u256 Ca, Ce, Ci, Co, Cu; - - // Ca = XOR256(Aba, XOR256(Aga, XOR256(Aka, XOR256(Ama, Asa)))); - Ca = A_4x[sa]; - Ca ^= A_4x[ma]; - Ca ^= A_4x[ka]; - Ca ^= A_4x[ga]; - Ca ^= A_4x[ba]; - - // Ce = XOR256(Abe, XOR256(Age, XOR256(Ake, XOR256(Ame, Ase)))); - Ce = A_4x[se]; - Ce ^= A_4x[me]; - Ce ^= A_4x[ke]; - Ce ^= A_4x[ge]; - Ce ^= A_4x[be]; - - // Ci = XOR256(Abi, XOR256(Agi, XOR256(Aki, XOR256(Ami, Asi)))); - Ci = A_4x[si]; - Ci ^= A_4x[mi]; - Ci ^= A_4x[ki]; - Ci ^= A_4x[gi]; - Ci ^= A_4x[bi]; - - // Co = XOR256(Abo, XOR256(Ago, XOR256(Ako, XOR256(Amo, Aso)))); - Co = A_4x[so]; - Co ^= A_4x[mo]; - Co ^= A_4x[ko]; - Co ^= A_4x[go]; - Co ^= A_4x[bo]; - - // Cu = XOR256(Abu, XOR256(Agu, XOR256(Aku, XOR256(Amu, Asu)))); - Cu = A_4x[su]; - Cu ^= A_4x[mu]; - Cu ^= A_4x[ku]; - Cu ^= A_4x[gu]; - Cu ^= A_4x[bu]; - - return Ca, Ce, Ci, Co, Cu; -} - -inline fn __first(reg u256 Ca, reg u256 Ce, reg u256 Ci, reg u256 Co, reg u256 Cu) -> reg u256, reg u256, reg u256, reg u256, reg u256 -{ - reg u256 Da, De, Di, Do, Du; - reg u256 Ca1, Ce1, Ci1, Co1, Cu1; - - Ce1 = __rol_4u64(Ce, 1); - Da = Cu ^ Ce1; - - Ci1 = __rol_4u64(Ci, 1); - De = Ca ^ Ci1; - - Co1 = __rol_4u64(Co, 1); - Di = Ce ^ Co1; - - Cu1 = __rol_4u64(Cu, 1); - Do = Ci ^ Cu1; - - Ca1 = __rol_4u64(Ca, 1); - Du = Co ^ Ca1; - - return Da, De, Di, Do, Du; -} - - -inline fn __second_even( -reg ptr u256[25] A_4x, reg ptr u256[25] E_4x, inline int index, -reg u256 Ca, reg u256 Ce, reg u256 Ci, reg u256 Co, reg u256 Cu, -reg u256 Da, reg u256 De, reg u256 Di, reg u256 Do, reg u256 Du) --> reg ptr u256[25], reg ptr u256[25], reg u256, reg u256, reg u256, reg u256, reg u256 -{ - reg u256 Bba, Bbe, Bbi, Bbo, Bbu; - reg u256 t256; - - t256 = A_4x[ba]; - t256 ^= Da; - A_4x[ba] = t256; - Bba = t256; - - t256 = A_4x[ge]; - t256 ^= De; - A_4x[ge] = t256; - Bbe = __rol_4u64(t256, 44); - - t256 = A_4x[ki]; - t256 ^= Di; - A_4x[ki] = t256; - Bbi = __rol_4u64(t256, 43); - - // E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); - t256 = #VPANDN_256(Bbe, Bbi); - t256 ^= Bba; - t256 ^= KeccakF1600RoundConstants[index]; - E_4x[ba] = t256; - - Ca = t256; - - t256 = A_4x[mo]; - t256 ^= Do; - A_4x[mo] = t256; - Bbo = __rol_4u64(t256, 21); - - // E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); - t256 = #VPANDN_256(Bbi, Bbo); - t256 ^= Bbe; - E_4x[be] = t256; - - Ce = t256; - - t256 = A_4x[su]; - t256 ^= Du; - A_4x[su] = t256; - Bbu = __rol_4u64(t256, 14); - - // E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); - t256 = #VPANDN_256(Bbo, Bbu); - t256 ^= Bbi; - E_4x[bi] = t256; - - Ci = t256; - - // E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); - t256 = #VPANDN_256(Bbu, Bba); - t256 ^= Bbo; - E_4x[bo] = t256; - - Co = t256; - - // E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); - t256 = #VPANDN_256(Bba, Bbe); - t256 ^= Bbu; - E_4x[bu] = t256; - - Cu = t256; - - return A_4x, E_4x, Ca, Ce, Ci, Co, Cu; -} - -inline fn __third_even( -reg ptr u256[25] A_4x, reg ptr u256[25] E_4x, -reg u256 Ca, reg u256 Ce, reg u256 Ci, reg u256 Co, reg u256 Cu, -reg u256 Da, reg u256 De, reg u256 Di, reg u256 Do, reg u256 Du) --> reg ptr u256[25], reg ptr u256[25], reg u256, reg u256, reg u256, reg u256, reg u256 -{ - reg u256 Bga, Bge, Bgi, Bgo, Bgu; - reg u256 t256; - - t256 = A_4x[bo]; - t256 ^= Do; - A_4x[bo] = t256; - Bga = __rol_4u64(t256, 28); - - t256 = A_4x[gu]; - t256 ^= Du; - A_4x[gu] = t256; - Bge = __rol_4u64(t256, 20); - - t256 = A_4x[ka]; - t256 ^= Da; - A_4x[ka] = t256; - Bgi = __rol_4u64(t256, 3); - - // E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)) - t256 = #VPANDN_256(Bge, Bgi); - t256 ^= Bga; - E_4x[ga] = t256; - - Ca ^= t256; - - t256 = A_4x[me]; - t256 ^= De; - A_4x[me] = t256; - Bgo = __rol_4u64(t256, 45); - - // E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)) - t256 = #VPANDN_256(Bgi, Bgo); - t256 ^= Bge; - E_4x[ge] = t256; - - Ce ^= t256; - - t256 = A_4x[si]; - t256 ^= Di; - A_4x[si] = t256; - Bgu = __rol_4u64(t256, 61); - - // E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)) - t256 = #VPANDN_256(Bgo, Bgu); - t256 ^= Bgi; - E_4x[gi] = t256; - - Ci ^= t256; - - // E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); - t256 = #VPANDN_256(Bgu, Bga); - t256 ^= Bgo; - E_4x[go] = t256; - - Co ^= t256; - - // E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); - t256 = #VPANDN_256(Bga, Bge); - t256 ^= Bgu; - E_4x[gu] = t256; - - Cu ^= t256; - - return A_4x, E_4x, Ca, Ce, Ci, Co, Cu; -} - -inline fn __fourth_even( -reg ptr u256[25] A_4x, reg ptr u256[25] E_4x, -reg u256 Ca, reg u256 Ce, reg u256 Ci, reg u256 Co, reg u256 Cu, -reg u256 Da, reg u256 De, reg u256 Di, reg u256 Do, reg u256 Du) --> reg ptr u256[25], reg ptr u256[25], reg u256, reg u256, reg u256, reg u256, reg u256 -{ - reg u256 Bka, Bke, Bki, Bko, Bku; - reg u256 t256; - - t256 = A_4x[be]; - t256 ^= De; - A_4x[be] = t256; - Bka = __rol_4u64(t256, 1); - - t256 = A_4x[gi]; - t256 ^= Di; - A_4x[gi] = t256; - Bke = __rol_4u64(t256, 6); - - t256 = A_4x[ko]; - t256 ^= Do; - A_4x[ko] = t256; - Bki = __rol_4u64(t256, 25); - - // E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); - t256 = #VPANDN_256(Bke, Bki); - t256 ^= Bka; - E_4x[ka] = t256; - - Ca ^= t256; - - t256 = A_4x[mu]; - t256 ^= Du; - A_4x[mu] = t256; - Bko = __rol_4u64_rho8(t256); - - // E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); - t256 = #VPANDN_256(Bki, Bko); - t256 ^= Bke; - E_4x[ke] = t256; - - Ce ^= t256; - - t256 = A_4x[sa]; - t256 ^= Da; - A_4x[sa] = t256; - Bku = __rol_4u64(t256, 18); - - // E##ki = XOR256(Bki, ANDnu256(Bko, Bku)) - t256 = #VPANDN_256(Bko, Bku); - t256 ^= Bki; - E_4x[ki] = t256; - - Ci ^= t256; - - // E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); - t256 = #VPANDN_256(Bku, Bka); - t256 ^= Bko; - E_4x[ko] = t256; - - Co ^= t256; - - // E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); - t256 = #VPANDN_256(Bka, Bke); - t256 ^= Bku; - E_4x[ku] = t256; - - Cu ^= t256; - - return A_4x, E_4x, Ca, Ce, Ci, Co, Cu; -} - -inline fn __fifth_even( -reg ptr u256[25] A_4x, reg ptr u256[25] E_4x, -reg u256 Ca, reg u256 Ce, reg u256 Ci, reg u256 Co, reg u256 Cu, -reg u256 Da, reg u256 De, reg u256 Di, reg u256 Do, reg u256 Du) --> reg ptr u256[25], reg ptr u256[25], reg u256, reg u256, reg u256, reg u256, reg u256 -{ - reg u256 Bma, Bme, Bmi, Bmo, Bmu; - reg u256 t256; - - t256 = A_4x[bu]; - t256 ^= Du; - A_4x[bu] = t256; - Bma = __rol_4u64(t256, 27); - - t256 = A_4x[ga]; - t256 ^= Da; - A_4x[ga] = t256; - Bme = __rol_4u64(t256, 36); - - t256 = A_4x[ke]; - t256 ^= De; - A_4x[ke] = t256; - Bmi = __rol_4u64(t256, 10); - - // E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); - t256 = #VPANDN_256(Bme, Bmi); - t256 ^= Bma; - E_4x[ma] = t256; - - Ca ^= t256; - - t256 = A_4x[mi]; - t256 ^= Di; - A_4x[mi] = t256; - Bmo = __rol_4u64(t256, 15); - - // E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); - t256 = #VPANDN_256(Bmi, Bmo); - t256 ^= Bme; - E_4x[me] = t256; - - Ce ^= t256; - - t256 = A_4x[so]; - t256 ^= Do; - A_4x[so] = t256; - Bmu = __rol_4u64_rho56(t256); - - // E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); - t256 = #VPANDN_256(Bmo, Bmu); - t256 ^= Bmi; - E_4x[mi] = t256; - - Ci ^= t256; - - // E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); - t256 = #VPANDN_256(Bmu, Bma); - t256 ^= Bmo; - E_4x[mo] = t256; - - Co ^= t256; - - // E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); - t256 = #VPANDN_256(Bma, Bme); - t256 ^= Bmu; - E_4x[mu] = t256; - - Cu ^= t256; - - return A_4x, E_4x, Ca, Ce, Ci, Co, Cu; -} - -inline fn __sixth_even( -reg ptr u256[25] A_4x, reg ptr u256[25] E_4x, -reg u256 Ca, reg u256 Ce, reg u256 Ci, reg u256 Co, reg u256 Cu, -reg u256 Da, reg u256 De, reg u256 Di, reg u256 Do, reg u256 Du) --> reg ptr u256[25], reg ptr u256[25], reg u256, reg u256, reg u256, reg u256, reg u256 -{ - reg u256 Bsa, Bse, Bsi, Bso, Bsu; - reg u256 t256; - - t256 = A_4x[bi]; - t256 ^= Di; - A_4x[bi] = t256; - Bsa = __rol_4u64(t256, 62); - - t256 = A_4x[go]; - t256 ^= Do; - A_4x[go] = t256; - Bse = __rol_4u64(t256, 55); - - t256 = A_4x[ku]; - t256 ^= Du; - A_4x[ku] = t256; - Bsi = __rol_4u64(t256, 39); - - // E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); - t256 = #VPANDN_256(Bse, Bsi); - t256 ^= Bsa; - E_4x[sa] = t256; - - Ca ^= t256; - - t256 = A_4x[ma]; - t256 ^= Da; - A_4x[ma] = t256; - Bso = __rol_4u64(t256, 41); - - // E##se = XOR256(Bse, ANDnu256(Bsi, Bso)) - t256 = #VPANDN_256(Bsi, Bso); - t256 ^= Bse; - E_4x[se] = t256; - - Ce ^= t256; - - t256 = A_4x[se]; - t256 ^= De; - A_4x[se] = t256; - Bsu = __rol_4u64(t256, 2); - - // E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); - t256 = #VPANDN_256(Bso, Bsu); - t256 ^= Bsi; - E_4x[si] = t256; - - Ci ^= t256; - - // E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); - t256 = #VPANDN_256(Bsu, Bsa); - t256 ^= Bso; - E_4x[so] = t256; - - Co ^= t256; - - // E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); - t256 = #VPANDN_256(Bsa, Bse); - t256 ^= Bsu; - E_4x[su] = t256; - - Cu ^= t256; - - return A_4x, E_4x, Ca, Ce, Ci, Co, Cu; -} - -inline fn __second_odd( -reg ptr u256[25] A_4x, reg ptr u256[25] E_4x, inline int index, -reg u256 Ca, reg u256 Ce, reg u256 Ci, reg u256 Co, reg u256 Cu, -reg u256 Da, reg u256 De, reg u256 Di, reg u256 Do, reg u256 Du) --> reg ptr u256[25], reg ptr u256[25], reg u256, reg u256, reg u256, reg u256, reg u256 -{ - reg u256 Bba, Bbe, Bbi, Bbo, Bbu; - reg u256 t256; - - t256 = A_4x[ba]; - t256 ^= Da; - A_4x[ba] = t256; - Bba = t256; - - t256 = A_4x[ge]; - t256 ^= De; - A_4x[ge] = t256; - Bbe = __rol_4u64(t256, 44); - - t256 = A_4x[ki]; - t256 ^= Di; - A_4x[ki] = t256; - Bbi = __rol_4u64(t256, 43); - - // E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); - t256 = #VPANDN_256(Bbe, Bbi); - t256 ^= Bba; - t256 ^= KeccakF1600RoundConstants[index]; - E_4x[ba] = t256; - - Ca = t256; - - t256 = A_4x[mo]; - t256 ^= Do; - A_4x[mo] = t256; - Bbo = __rol_4u64(t256, 21); - - // E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); - t256 = #VPANDN_256(Bbi, Bbo); - t256 ^= Bbe; - E_4x[be] = t256; - - Ce = t256; - - t256 = A_4x[su]; - t256 ^= Du; - A_4x[su] = t256; - Bbu = __rol_4u64(t256, 14); - - // E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); - t256 = #VPANDN_256(Bbo, Bbu); - t256 ^= Bbi; - E_4x[bi] = t256; - - Ci = t256; - - // E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); - t256 = #VPANDN_256(Bbu, Bba); - t256 ^= Bbo; - E_4x[bo] = t256; - - Co = t256; - - // E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); - t256 = #VPANDN_256(Bba, Bbe); - t256 ^= Bbu; - E_4x[bu] = t256; - - Cu = t256; - - return A_4x, E_4x, Ca, Ce, Ci, Co, Cu; -} - -inline fn __third_odd( -reg ptr u256[25] A_4x, reg ptr u256[25] E_4x, -reg u256 Ca, reg u256 Ce, reg u256 Ci, reg u256 Co, reg u256 Cu, -reg u256 Da, reg u256 De, reg u256 Di, reg u256 Do, reg u256 Du) --> reg ptr u256[25], reg ptr u256[25], reg u256, reg u256, reg u256, reg u256, reg u256 -{ - reg u256 Bga, Bge, Bgi, Bgo, Bgu; - reg u256 t256; - - t256 = A_4x[bo]; - t256 ^= Do; - A_4x[bo] = t256; - Bga = __rol_4u64(t256, 28); - - t256 = A_4x[gu]; - t256 ^= Du; - A_4x[gu] = t256; - Bge = __rol_4u64(t256, 20); - - t256 = A_4x[ka]; - t256 ^= Da; - A_4x[ka] = t256; - Bgi = __rol_4u64(t256, 3); - - // E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)) - t256 = #VPANDN_256(Bge, Bgi); - t256 ^= Bga; - E_4x[ga] = t256; - - Ca ^= t256; - - t256 = A_4x[me]; - t256 ^= De; - A_4x[me] = t256; - Bgo = __rol_4u64(t256, 45); - - // E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)) - t256 = #VPANDN_256(Bgi, Bgo); - t256 ^= Bge; - E_4x[ge] = t256; - - Ce ^= t256; - - t256 = A_4x[si]; - t256 ^= Di; - A_4x[si] = t256; - Bgu = __rol_4u64(t256, 61); - - // E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)) - t256 = #VPANDN_256(Bgo, Bgu); - t256 ^= Bgi; - E_4x[gi] = t256; - - Ci ^= t256; - - // E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); - t256 = #VPANDN_256(Bgu, Bga); - t256 ^= Bgo; - E_4x[go] = t256; - - Co ^= t256; - - // E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); - t256 = #VPANDN_256(Bga, Bge); - t256 ^= Bgu; - E_4x[gu] = t256; - - Cu ^= t256; - - return A_4x, E_4x, Ca, Ce, Ci, Co, Cu; -} - -inline fn __fourth_odd( -reg ptr u256[25] A_4x, reg ptr u256[25] E_4x, -reg u256 Ca, reg u256 Ce, reg u256 Ci, reg u256 Co, reg u256 Cu, -reg u256 Da, reg u256 De, reg u256 Di, reg u256 Do, reg u256 Du) --> reg ptr u256[25], reg ptr u256[25], reg u256, reg u256, reg u256, reg u256, reg u256 -{ - reg u256 Bka, Bke, Bki, Bko, Bku; - reg u256 t256; - - t256 = A_4x[be]; - t256 ^= De; - A_4x[be] = t256; - Bka = __rol_4u64(t256, 1); - - t256 = A_4x[gi]; - t256 ^= Di; - A_4x[gi] = t256; - Bke = __rol_4u64(t256, 6); - - t256 = A_4x[ko]; - t256 ^= Do; - A_4x[ko] = t256; - Bki = __rol_4u64(t256, 25); - - // E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); - t256 = #VPANDN_256(Bke, Bki); - t256 ^= Bka; - E_4x[ka] = t256; - - Ca ^= t256; - - t256 = A_4x[mu]; - t256 ^= Du; - A_4x[mu] = t256; - Bko = __rol_4u64_rho8(t256); - - // E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); - t256 = #VPANDN_256(Bki, Bko); - t256 ^= Bke; - E_4x[ke] = t256; - - Ce ^= t256; - - t256 = A_4x[sa]; - t256 ^= Da; - A_4x[sa] = t256; - Bku = __rol_4u64(t256, 18); - - // E##ki = XOR256(Bki, ANDnu256(Bko, Bku)) - t256 = #VPANDN_256(Bko, Bku); - t256 ^= Bki; - E_4x[ki] = t256; - - Ci ^= t256; - - // E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); - t256 = #VPANDN_256(Bku, Bka); - t256 ^= Bko; - E_4x[ko] = t256; - - Co ^= t256; - - // E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); - t256 = #VPANDN_256(Bka, Bke); - t256 ^= Bku; - E_4x[ku] = t256; - - Cu ^= t256; - - return A_4x, E_4x, Ca, Ce, Ci, Co, Cu; -} - -inline fn __fifth_odd( -reg ptr u256[25] A_4x, reg ptr u256[25] E_4x, -reg u256 Ca, reg u256 Ce, reg u256 Ci, reg u256 Co, reg u256 Cu, -reg u256 Da, reg u256 De, reg u256 Di, reg u256 Do, reg u256 Du) --> reg ptr u256[25], reg ptr u256[25], reg u256, reg u256, reg u256, reg u256, reg u256 -{ - reg u256 Bma, Bme, Bmi, Bmo, Bmu; - reg u256 t256; - - t256 = A_4x[bu]; - t256 ^= Du; - A_4x[bu] = t256; - Bma = __rol_4u64(t256, 27); - - t256 = A_4x[ga]; - t256 ^= Da; - A_4x[ga] = t256; - Bme = __rol_4u64(t256, 36); - - t256 = A_4x[ke]; - t256 ^= De; - A_4x[ke] = t256; - Bmi = __rol_4u64(t256, 10); - - // E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); - t256 = #VPANDN_256(Bme, Bmi); - t256 ^= Bma; - E_4x[ma] = t256; - - Ca ^= t256; - - t256 = A_4x[mi]; - t256 ^= Di; - A_4x[mi] = t256; - Bmo = __rol_4u64(t256, 15); - - // E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); - t256 = #VPANDN_256(Bmi, Bmo); - t256 ^= Bme; - E_4x[me] = t256; - - Ce ^= t256; - - t256 = A_4x[so]; - t256 ^= Do; - A_4x[so] = t256; - Bmu = __rol_4u64_rho56(t256); - - // E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); - t256 = #VPANDN_256(Bmo, Bmu); - t256 ^= Bmi; - E_4x[mi] = t256; - - Ci ^= t256; - - // E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); - t256 = #VPANDN_256(Bmu, Bma); - t256 ^= Bmo; - E_4x[mo] = t256; - - Co ^= t256; - - // E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); - t256 = #VPANDN_256(Bma, Bme); - t256 ^= Bmu; - E_4x[mu] = t256; - - Cu ^= t256; - - return A_4x, E_4x, Ca, Ce, Ci, Co, Cu; -} - -inline fn __sixth_odd( -reg ptr u256[25] A_4x, reg ptr u256[25] E_4x, -reg u256 Ca, reg u256 Ce, reg u256 Ci, reg u256 Co, reg u256 Cu, -reg u256 Da, reg u256 De, reg u256 Di, reg u256 Do, reg u256 Du) --> reg ptr u256[25], reg ptr u256[25], reg u256, reg u256, reg u256, reg u256, reg u256 -{ - reg u256 Bsa, Bse, Bsi, Bso, Bsu; - reg u256 t256; - - t256 = A_4x[bi]; - t256 ^= Di; - A_4x[bi] = t256; - Bsa = __rol_4u64(t256, 62); - - t256 = A_4x[go]; - t256 ^= Do; - A_4x[go] = t256; - Bse = __rol_4u64(t256, 55); - - t256 = A_4x[ku]; - t256 ^= Du; - A_4x[ku] = t256; - Bsi = __rol_4u64(t256, 39); - - // E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); - t256 = #VPANDN_256(Bse, Bsi); - t256 ^= Bsa; - E_4x[sa] = t256; - - Ca ^= t256; - - t256 = A_4x[ma]; - t256 ^= Da; - A_4x[ma] = t256; - Bso = __rol_4u64(t256, 41); - - // E##se = XOR256(Bse, ANDnu256(Bsi, Bso)) - t256 = #VPANDN_256(Bsi, Bso); - t256 ^= Bse; - E_4x[se] = t256; - - Ce ^= t256; - - t256 = A_4x[se]; - t256 ^= De; - A_4x[se] = t256; - Bsu = __rol_4u64(t256, 2); - - // E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); - t256 = #VPANDN_256(Bso, Bsu); - t256 ^= Bsi; - E_4x[si] = t256; - - Ci ^= t256; - - // E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); - t256 = #VPANDN_256(Bsu, Bsa); - t256 ^= Bso; - E_4x[so] = t256; - - Co ^= t256; - - // E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); - t256 = #VPANDN_256(Bsa, Bse); - t256 ^= Bsu; - E_4x[su] = t256; - - Cu ^= t256; - - return A_4x, E_4x, Ca, Ce, Ci, Co, Cu; -} - -inline fn __second_last( -reg ptr u256[25] A_4x, reg ptr u256[25] E_4x, inline int index, -reg u256 Da, reg u256 De, reg u256 Di, reg u256 Do, reg u256 Du) --> reg ptr u256[25], reg ptr u256[25] -{ - reg u256 Bba, Bbe, Bbi, Bbo, Bbu; - reg u256 t256; - - t256 = A_4x[ba]; - t256 ^= Da; - A_4x[ba] = t256; - Bba = t256; - - t256 = A_4x[ge]; - t256 ^= De; - A_4x[ge] = t256; - Bbe = __rol_4u64(t256, 44); - - t256 = A_4x[ki]; - t256 ^= Di; - A_4x[ki] = t256; - Bbi = __rol_4u64(t256, 43); - - // E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); - t256 = #VPANDN_256(Bbe, Bbi); - t256 ^= Bba; - t256 ^= KeccakF1600RoundConstants[index]; - E_4x[ba] = t256; - - t256 = A_4x[mo]; - t256 ^= Do; - A_4x[mo] = t256; - Bbo = __rol_4u64(t256, 21); - - // E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); - t256 = #VPANDN_256(Bbi, Bbo); - t256 ^= Bbe; - E_4x[be] = t256; - - t256 = A_4x[su]; - t256 ^= Du; - A_4x[su] = t256; - Bbu = __rol_4u64(t256, 14); - - // E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); - t256 = #VPANDN_256(Bbo, Bbu); - t256 ^= Bbi; - E_4x[bi] = t256; - - // E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); - t256 = #VPANDN_256(Bbu, Bba); - t256 ^= Bbo; - E_4x[bo] = t256; - - // E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); - t256 = #VPANDN_256(Bba, Bbe); - t256 ^= Bbu; - E_4x[bu] = t256; - - return A_4x, E_4x; -} - -inline fn __third_last( -reg ptr u256[25] A_4x, reg ptr u256[25] E_4x, -reg u256 Da, reg u256 De, reg u256 Di, reg u256 Do, reg u256 Du) --> reg ptr u256[25], reg ptr u256[25] -{ - reg u256 Bga, Bge, Bgi, Bgo, Bgu; - reg u256 t256; - - t256 = A_4x[bo]; - t256 ^= Do; - A_4x[bo] = t256; - Bga = __rol_4u64(t256, 28); - - t256 = A_4x[gu]; - t256 ^= Du; - A_4x[gu] = t256; - Bge = __rol_4u64(t256, 20); - - t256 = A_4x[ka]; - t256 ^= Da; - A_4x[ka] = t256; - Bgi = __rol_4u64(t256, 3); - - // E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)) - t256 = #VPANDN_256(Bge, Bgi); - t256 ^= Bga; - E_4x[ga] = t256; - - t256 = A_4x[me]; - t256 ^= De; - A_4x[me] = t256; - Bgo = __rol_4u64(t256, 45); - - // E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)) - t256 = #VPANDN_256(Bgi, Bgo); - t256 ^= Bge; - E_4x[ge] = t256; - - t256 = A_4x[si]; - t256 ^= Di; - A_4x[si] = t256; - Bgu = __rol_4u64(t256, 61); - - // E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)) - t256 = #VPANDN_256(Bgo, Bgu); - t256 ^= Bgi; - E_4x[gi] = t256; - - // E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); - t256 = #VPANDN_256(Bgu, Bga); - t256 ^= Bgo; - E_4x[go] = t256; - - // E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); - t256 = #VPANDN_256(Bga, Bge); - t256 ^= Bgu; - E_4x[gu] = t256; - - return A_4x, E_4x; -} - -inline fn __fourth_last( -reg ptr u256[25] A_4x, reg ptr u256[25] E_4x, -reg u256 Da, reg u256 De, reg u256 Di, reg u256 Do, reg u256 Du) --> reg ptr u256[25], reg ptr u256[25] -{ - reg u256 Bka, Bke, Bki, Bko, Bku; - reg u256 t256; - - t256 = A_4x[be]; - t256 ^= De; - A_4x[be] = t256; - Bka = __rol_4u64(t256, 1); - - t256 = A_4x[gi]; - t256 ^= Di; - A_4x[gi] = t256; - Bke = __rol_4u64(t256, 6); - - t256 = A_4x[ko]; - t256 ^= Do; - A_4x[ko] = t256; - Bki = __rol_4u64(t256, 25); - - // E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); - t256 = #VPANDN_256(Bke, Bki); - t256 ^= Bka; - E_4x[ka] = t256; - - t256 = A_4x[mu]; - t256 ^= Du; - A_4x[mu] = t256; - Bko = __rol_4u64_rho8(t256); - - // E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); - t256 = #VPANDN_256(Bki, Bko); - t256 ^= Bke; - E_4x[ke] = t256; - - t256 = A_4x[sa]; - t256 ^= Da; - A_4x[sa] = t256; - Bku = __rol_4u64(t256, 18); - - // E##ki = XOR256(Bki, ANDnu256(Bko, Bku)) - t256 = #VPANDN_256(Bko, Bku); - t256 ^= Bki; - E_4x[ki] = t256; - - // E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); - t256 = #VPANDN_256(Bku, Bka); - t256 ^= Bko; - E_4x[ko] = t256; - - // E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); - t256 = #VPANDN_256(Bka, Bke); - t256 ^= Bku; - E_4x[ku] = t256; - - return A_4x, E_4x; -} - -inline fn __fifth_last( -reg ptr u256[25] A_4x, reg ptr u256[25] E_4x, -reg u256 Da, reg u256 De, reg u256 Di, reg u256 Do, reg u256 Du) --> reg ptr u256[25], reg ptr u256[25] -{ - reg u256 Bma, Bme, Bmi, Bmo, Bmu; - reg u256 t256; - - t256 = A_4x[bu]; - t256 ^= Du; - A_4x[bu] = t256; - Bma = __rol_4u64(t256, 27); - - t256 = A_4x[ga]; - t256 ^= Da; - A_4x[ga] = t256; - Bme = __rol_4u64(t256, 36); - - t256 = A_4x[ke]; - t256 ^= De; - A_4x[ke] = t256; - Bmi = __rol_4u64(t256, 10); - - // E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); - t256 = #VPANDN_256(Bme, Bmi); - t256 ^= Bma; - E_4x[ma] = t256; - - t256 = A_4x[mi]; - t256 ^= Di; - A_4x[mi] = t256; - Bmo = __rol_4u64(t256, 15); - - // E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); - t256 = #VPANDN_256(Bmi, Bmo); - t256 ^= Bme; - E_4x[me] = t256; - - t256 = A_4x[so]; - t256 ^= Do; - A_4x[so] = t256; - Bmu = __rol_4u64_rho56(t256); - - // E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); - t256 = #VPANDN_256(Bmo, Bmu); - t256 ^= Bmi; - E_4x[mi] = t256; - - // E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); - t256 = #VPANDN_256(Bmu, Bma); - t256 ^= Bmo; - E_4x[mo] = t256; - - // E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); - t256 = #VPANDN_256(Bma, Bme); - t256 ^= Bmu; - E_4x[mu] = t256; - - return A_4x, E_4x; -} - -inline fn __sixth_last( -reg ptr u256[25] A_4x, reg ptr u256[25] E_4x, -reg u256 Da, reg u256 De, reg u256 Di, reg u256 Do, reg u256 Du) --> reg ptr u256[25], reg ptr u256[25] -{ - reg u256 Bsa, Bse, Bsi, Bso, Bsu; - reg u256 t256; - - t256 = A_4x[bi]; - t256 ^= Di; - A_4x[bi] = t256; - Bsa = __rol_4u64(t256, 62); - - t256 = A_4x[go]; - t256 ^= Do; - A_4x[go] = t256; - Bse = __rol_4u64(t256, 55); - - t256 = A_4x[ku]; - t256 ^= Du; - A_4x[ku] = t256; - Bsi = __rol_4u64(t256, 39); - - // E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); - t256 = #VPANDN_256(Bse, Bsi); - t256 ^= Bsa; - E_4x[sa] = t256; - - t256 = A_4x[ma]; - t256 ^= Da; - A_4x[ma] = t256; - Bso = __rol_4u64(t256, 41); - - // E##se = XOR256(Bse, ANDnu256(Bsi, Bso)) - t256 = #VPANDN_256(Bsi, Bso); - t256 ^= Bse; - E_4x[se] = t256; - - t256 = A_4x[se]; - t256 ^= De; - A_4x[se] = t256; - Bsu = __rol_4u64(t256, 2); - - // E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); - t256 = #VPANDN_256(Bso, Bsu); - t256 ^= Bsi; - E_4x[si] = t256; - - // E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); - t256 = #VPANDN_256(Bsu, Bsa); - t256 ^= Bso; - E_4x[so] = t256; - - // E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); - t256 = #VPANDN_256(Bsa, Bse); - t256 ^= Bsu; - E_4x[su] = t256; - - return A_4x, E_4x; -} - -inline fn __theta_rho_pi_chi_iota_prepare_theta_even( -reg ptr u256[25] A_4x, reg ptr u256[25] E_4x, inline int index, -reg u256 Ca, reg u256 Ce, reg u256 Ci, reg u256 Co, reg u256 Cu) --> reg ptr u256[25], reg ptr u256[25], reg u256, reg u256, reg u256, reg u256, reg u256 -{ - reg u256 Da, De, Di, Do, Du; - - Da, De, Di, Do, Du = __first(Ca, Ce, Ci, Co, Cu); - - A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __second_even(A_4x, E_4x, index, Ca, Ce, Ci, Co, Cu, Da, De, Di, Do, Du); - - A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __third_even(A_4x, E_4x, Ca, Ce, Ci, Co, Cu, Da, De, Di, Do, Du); - - A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __fourth_even(A_4x, E_4x, Ca, Ce, Ci, Co, Cu, Da, De, Di, Do, Du); - - A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __fifth_even(A_4x, E_4x, Ca, Ce, Ci, Co, Cu, Da, De, Di, Do, Du); - - A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __sixth_even(A_4x, E_4x, Ca, Ce, Ci, Co, Cu, Da, De, Di, Do, Du); - - return A_4x, E_4x, Ca, Ce, Ci, Co, Cu; -} - -inline fn __theta_rho_pi_chi_iota_prepare_theta_odd( -reg ptr u256[25] A_4x, reg ptr u256[25] E_4x, inline int index, -reg u256 Ca, reg u256 Ce, reg u256 Ci, reg u256 Co, reg u256 Cu) --> reg ptr u256[25], reg ptr u256[25], reg u256, reg u256, reg u256, reg u256, reg u256 -{ - reg u256 Da, De, Di, Do, Du; - - Da, De, Di, Do, Du = __first(Ca, Ce, Ci, Co, Cu); - - A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __second_odd(A_4x, E_4x, index, Ca, Ce, Ci, Co, Cu, Da, De, Di, Do, Du); - - A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __third_odd(A_4x, E_4x, Ca, Ce, Ci, Co, Cu, Da, De, Di, Do, Du); - - A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __fourth_odd(A_4x, E_4x, Ca, Ce, Ci, Co, Cu, Da, De, Di, Do, Du); - - A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __fifth_odd(A_4x, E_4x, Ca, Ce, Ci, Co, Cu, Da, De, Di, Do, Du); - - A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __sixth_odd(A_4x, E_4x, Ca, Ce, Ci, Co, Cu, Da, De, Di, Do, Du); - - return A_4x, E_4x, Ca, Ce, Ci, Co, Cu; -} - -inline fn __theta_rho_pi_chi_iota( -reg ptr u256[25] A_4x, reg ptr u256[25] E_4x, inline int index, -reg u256 Ca, reg u256 Ce, reg u256 Ci, reg u256 Co, reg u256 Cu) --> reg ptr u256[25], reg ptr u256[25] -{ - reg u256 Da, De, Di, Do, Du; - - Da, De, Di, Do, Du = __first(Ca, Ce, Ci, Co, Cu); - - A_4x, E_4x = __second_last(A_4x, E_4x, index, Da, De, Di, Do, Du); - - A_4x, E_4x = __third_last(A_4x, E_4x, Da, De, Di, Do, Du); - - A_4x, E_4x = __fourth_last(A_4x, E_4x, Da, De, Di, Do, Du); - - A_4x, E_4x = __fifth_last(A_4x, E_4x, Da, De, Di, Do, Du); - - A_4x, E_4x = __sixth_last(A_4x, E_4x, Da, De, Di, Do, Du); - - return A_4x, E_4x; -} - -fn _KeccakF1600_StatePermute4x(reg ptr u256[25] A_4x) -> reg ptr u256[25] -{ - reg u256 Ca, Ce, Ci, Co, Cu; - - stack u256[25] E_4x; - - /** Rounds24 **/ - Ca, Ce, Ci, Co, Cu = __prepare_theta(A_4x); - A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_even(A_4x, E_4x, 0, Ca, Ce, Ci, Co, Cu); - E_4x, A_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_odd(E_4x, A_4x, 1, Ca, Ce, Ci, Co, Cu); - A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_even(A_4x, E_4x, 2, Ca, Ce, Ci, Co, Cu); - E_4x, A_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_odd(E_4x, A_4x, 3, Ca, Ce, Ci, Co, Cu); - A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_even(A_4x, E_4x, 4, Ca, Ce, Ci, Co, Cu); - E_4x, A_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_odd(E_4x, A_4x, 5, Ca, Ce, Ci, Co, Cu); - A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_even(A_4x, E_4x, 6, Ca, Ce, Ci, Co, Cu); - E_4x, A_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_odd(E_4x, A_4x, 7, Ca, Ce, Ci, Co, Cu); - A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_even(A_4x, E_4x, 8, Ca, Ce, Ci, Co, Cu); - E_4x, A_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_odd(E_4x, A_4x, 9, Ca, Ce, Ci, Co, Cu); - A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_even(A_4x, E_4x, 10, Ca, Ce, Ci, Co, Cu); - E_4x, A_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_odd(E_4x, A_4x, 11, Ca, Ce, Ci, Co, Cu); - A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_even(A_4x, E_4x, 12, Ca, Ce, Ci, Co, Cu); - E_4x, A_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_odd(E_4x, A_4x, 13, Ca, Ce, Ci, Co, Cu); - A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_even(A_4x, E_4x, 14, Ca, Ce, Ci, Co, Cu); - E_4x, A_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_odd(E_4x, A_4x, 15, Ca, Ce, Ci, Co, Cu); - A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_even(A_4x, E_4x, 16, Ca, Ce, Ci, Co, Cu); - E_4x, A_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_odd(E_4x, A_4x, 17, Ca, Ce, Ci, Co, Cu); - A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_even(A_4x, E_4x, 18, Ca, Ce, Ci, Co, Cu); - E_4x, A_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_odd(E_4x, A_4x, 19, Ca, Ce, Ci, Co, Cu); - A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_even(A_4x, E_4x, 20, Ca, Ce, Ci, Co, Cu); - E_4x, A_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_odd(E_4x, A_4x, 21, Ca, Ce, Ci, Co, Cu); - A_4x, E_4x, Ca, Ce, Ci, Co, Cu = __theta_rho_pi_chi_iota_prepare_theta_even(A_4x, E_4x, 22, Ca, Ce, Ci, Co, Cu); - E_4x, A_4x = __theta_rho_pi_chi_iota(E_4x, A_4x, 23, Ca, Ce, Ci, Co, Cu); - - - return A_4x; -} - - -fn _shake128_absorb4x_34(reg ptr u256[25] s, reg ptr u8[34] m0 m1 m2 m3) -> reg ptr u256[25] -{ - inline int i; - reg u256 t0 t1; - reg u16 t16; - reg u64 t64; - - for i = 0 to 25 - { - t0 = #set0_256(); - s[i] = t0; - } - - for i = 0 to 4 - { - t64 = m0[u64 i]; - s[u64 4 * i] ^= t64; - t64 = m1[u64 i]; - s[u64 4 * i + 1] ^= t64; - t64 = m2[u64 i]; - s[u64 4 * i + 2] ^= t64; - t64 = m3[u64 i]; - s[u64 4 * i + 3] ^= t64; - } - - t16 = m0.[u16 32]; - s[u16 64] ^= t16; - s[u8 130] ^= 0x1F; - - t16 = m1.[u16 32]; - s[u16 68] ^= t16; - s[u8 138] ^= 0x1F; - - t16 = m2.[u16 32]; - s[u16 72] ^= t16; - s[u8 146] ^= 0x1F; - - t16 = m3.[u16 32]; - s[u16 76] ^= t16; - s[u8 154] ^= 0x1F; - - t0 = shake_sep[u256 0]; - t1 = s[SHAKE128_RATE / 8 - 1]; - t0 = t0 ^ t1; - s[SHAKE128_RATE / 8 - 1] = t0; - - return s; -} - - -inline -fn __shake128_squeezeblock4x(reg ptr u256[25] state, reg ptr u8[SHAKE128_RATE] h0 h1 h2 h3) -> reg ptr u256[25], reg ptr u8[SHAKE128_RATE], reg ptr u8[SHAKE128_RATE], reg ptr u8[SHAKE128_RATE], reg ptr u8[SHAKE128_RATE] -{ - reg u256 t256; - reg u128 t128; - inline int i; - - state = _KeccakF1600_StatePermute4x(state); - - for i = 0 to (SHAKE128_RATE / 8) { - t256 = state[i]; - t128 = (128u)t256; - h0[u64 i] = #VMOVLPD(t128); - h1[u64 i] = #VMOVHPD(t128); - t128 = #VEXTRACTI128(t256, 1); - h2[u64 i] = #VMOVLPD(t128); - h3[u64 i] = #VMOVHPD(t128); - } - - return state, h0, h1, h2, h3; -} - - -fn _shake256_absorb4x_33(reg ptr u256[25] s, reg ptr u8[33] m0 m1 m2 m3) -> reg ptr u256[25] -{ - inline int i; - reg u256 t0 t1; - reg u64 t64; - reg u8 t8; - - for i = 0 to 25 - { - t0 = #set0_256(); - s[i] = t0; - } - - for i = 0 to 4 - { - t64 = m0[u64 i]; - s[u64 4 * i] ^= t64; - t64 = m1[u64 i]; - s[u64 4 * i + 1] ^= t64; - t64 = m2[u64 i]; - s[u64 4 * i + 2] ^= t64; - t64 = m3[u64 i]; - s[u64 4 * i + 3] ^= t64; - } - - t8 = m0[32]; - s[u8 128] ^= t8; - s[u8 129] ^= 0x1F; - - t8 = m1[32]; - s[u8 136] ^= t8; - s[u8 137] ^= 0x1F; - - t8 = m2[32]; - s[u8 144] ^= t8; - s[u8 145] ^= 0x1F; - - t8 = m3[32]; - s[u8 152] ^= t8; - s[u8 153] ^= 0x1F; - - t0 = shake_sep[u256 0]; - t1 = s[SHAKE256_RATE / 8 - 1]; - t0 = t0 ^ t1; - s[SHAKE256_RATE / 8 - 1] = t0; - - return s; -} - - -inline -fn __shake256_squeezeblock4x(reg ptr u256[25] state, reg ptr u8[SHAKE256_RATE] h0 h1 h2 h3) -> reg ptr u256[25], reg ptr u8[SHAKE256_RATE], reg ptr u8[SHAKE256_RATE], reg ptr u8[SHAKE256_RATE], reg ptr u8[SHAKE256_RATE] -{ - reg u256 t256; - reg u128 t128; - inline int i; - - state = _KeccakF1600_StatePermute4x(state); - - for i = 0 to (SHAKE256_RATE / 8) { - t256 = state[i]; - t128 = (128u)t256; - h0[u64 i] = #VMOVLPD(t128); - h1[u64 i] = #VMOVHPD(t128); - t128 = #VEXTRACTI128(t256, 1); - h2[u64 i] = #VMOVLPD(t128); - h3[u64 i] = #VMOVHPD(t128); - } - - return state, h0, h1, h2, h3; -} - -inline -fn __shake256_squeezeblock4xTRANSITION(reg ptr u256[25] state, reg ptr u8[128] h0 h1 h2 h3) -> reg ptr u256[25], reg ptr u8[128], reg ptr u8[128], reg ptr u8[128], reg ptr u8[128] -{ - reg u256 t256; - reg u128 t128; - inline int i; - - state = _KeccakF1600_StatePermute4x(state); - - for i = 0 to (128 / 8) { - t256 = state[i]; - t128 = (128u)t256; - h0[u64 i] = #VMOVLPD(t128); - h1[u64 i] = #VMOVHPD(t128); - t128 = #VEXTRACTI128(t256, 1); - h2[u64 i] = #VMOVLPD(t128); - h3[u64 i] = #VMOVHPD(t128); - } - - return state, h0, h1, h2, h3; -} diff --git a/code/jasmin/mlkem_avx2/keccak_OLD/fips202_common.jinc b/code/jasmin/mlkem_avx2/keccak_OLD/fips202_common.jinc deleted file mode 100644 index 0ed82a08..00000000 --- a/code/jasmin/mlkem_avx2/keccak_OLD/fips202_common.jinc +++ /dev/null @@ -1,6 +0,0 @@ -param int SHAKE128_RATE = 168; -param int SHAKE256_RATE = 136; -param int SHA3_256_RATE = 136; -param int SHA3_512_RATE = 72; - -u64[4] shake_sep = {9223372036854775808, 9223372036854775808, 9223372036854775808, 9223372036854775808}; diff --git a/code/jasmin/mlkem_avx2/keccak_OLD/gen_matrix_old.jinc b/code/jasmin/mlkem_avx2/keccak_OLD/gen_matrix_old.jinc deleted file mode 100644 index ce52e6c2..00000000 --- a/code/jasmin/mlkem_avx2/keccak_OLD/gen_matrix_old.jinc +++ /dev/null @@ -1,129 +0,0 @@ -require "params.jinc" -require "consts.jinc" -require "shuffle.jinc" -require "fips202.jinc" - -param int GENMATRIX_NBLOCKS = 3; -param int REJ_UNIFORM_BUFLEN = GENMATRIX_NBLOCKS * SHAKE128_RATE; - -inline -fn __rej_uniform_old(stack u16[MLKEM_N] rp, reg u64 offset, stack u8[REJ_UNIFORM_BUFLEN] buf, inline int buflen) -> reg u64, stack u16[MLKEM_N] -{ - reg u16 val0 val1; - reg u16 t; - reg u64 pos ctr; - reg u8 fl1 fl2; - reg bool cf zf b; - - ctr = offset; - pos = 0; - - _, cf, _, _, zf = #CMP_64(ctr, MLKEM_N - 1); - fl1 = #SETcc(cf || zf); //SETBE - - _, cf, _, _, zf = #CMP_64(pos, buflen - 3); - fl2 = #SETcc(cf || zf); //SETBE - - _, _, _, _, b = #TEST_8(fl1, fl2); - - while(!b) - { - val0 = (16u)buf[(int)pos]; - pos += 1; - - t = (16u)buf[(int)pos]; - val1 = t; - val1 >>= 4; - - t &= 0x0F; - t <<= 8; - val0 |= t; - pos += 1; - - t = (16u)buf[(int)pos]; - t <<= 4; - val1 |= t; - pos += 1; - - if(val0 < MLKEM_Q) - { - rp[(int)ctr] = val0; - ctr += 1; - } - - if(ctr < MLKEM_N) - { - if(val1 < MLKEM_Q) - { - rp[(int)ctr] = val1; - ctr += 1; - } - } - - _, cf, _, _, zf = #CMP_64(ctr, MLKEM_N - 1); - fl1 = #SETcc(cf || zf); //SETBE - - _, cf, _, _, zf = #CMP_64(pos, buflen - 3); - fl2 = #SETcc(cf || zf); //SETBE - - _, _, _, _, b = #TEST_8(fl1, fl2); - } - - return ctr, rp; -} - -inline -fn __gen_matrix_old(stack u8[MLKEM_SYMBYTES] seed, inline int transposed) -> stack u16[MLKEM_K*MLKEM_VECN] -{ - stack u8[34] extseed; - stack u8[REJ_UNIFORM_BUFLEN] buf; - stack u8[REJ_UNIFORM_BUFLEN] buf; - stack u8[REJ_UNIFORM_BUFLEN] buf; - stack u8[REJ_UNIFORM_BUFLEN] buf; - stack u64[25] state; - stack u16[MLKEM_K*MLKEM_VECN] rr; - - reg u64 t64; - stack u64 t64_s; - inline int i, j, k; - - for j = 0 to 4 - { - t64 = seed[u64 j]; - extseed[u64 j] = t64; - } - - for i = 0 to MLKEM_K - { - for j = 0 to MLKEM_K - { - if(transposed == 0) - { - extseed[MLKEM_SYMBYTES] = j; - extseed[MLKEM_SYMBYTES+1] = i; - } - else - { - extseed[MLKEM_SYMBYTES] = i; - extseed[MLKEM_SYMBYTES+1] = j; - } - - state = _shake128_absorb34(state, extseed); - - state, buf = __shake128_squeezenblocks(state, buf); - t64 = 0; - t64, rr[i*MLKEM_VECN+j*MLKEM_N:MLKEM_N] = __rej_uniform_old(rr[i*MLKEM_VECN+j*MLKEM_N:MLKEM_N], t64, buf, REJ_UNIFORM_BUFLEN); - - while (t64 < MLKEM_N) - { - t64_s = t64; - state, buf[0:SHAKE128_RATE] = _shake128_squeezeblock(state, buf[0:SHAKE128_RATE]); - t64 = t64_s; - t64, rr[i*MLKEM_VECN+j*MLKEM_N:MLKEM_N] = __rej_uniform_old(rr[i*MLKEM_VECN+j*MLKEM_N:MLKEM_N], t64, buf, SHAKE128_RATE); - } - rr[i*MLKEM_VECN+j*MLKEM_N:MLKEM_N] = _nttunpack(rr[i*MLKEM_VECN+j*MLKEM_N:MLKEM_N]); - } - } - - return rr; -} diff --git a/code/jasmin/mlkem_avx2/keccak_OLD/keccakf1600.jinc b/code/jasmin/mlkem_avx2/keccak_OLD/keccakf1600.jinc deleted file mode 100644 index 7e2b6869..00000000 --- a/code/jasmin/mlkem_avx2/keccak_OLD/keccakf1600.jinc +++ /dev/null @@ -1,169 +0,0 @@ -require "keccakf1600_generic.jinc" - -// C[x] = A[x,0] ^ A[x,1] ^ A[x,2] ^ A[x,3] ^ A[x,4] -inline fn keccakf1600_theta_sum(reg ptr u64[25] a) -> reg u64[5] -{ - inline int x y; - reg u64[5] c; - - // C[x] = A[x, 0] - for x=0 to 5 - { c[x] = a[x + 0]; } - - // C[x] ^= A[x,1] ^ A[x,2] ^ A[x,3] ^ A[x,4] - for y=1 to 5 - { for x=0 to 5 - { c[x] ^= a[x + y*5]; } - } - - return c; -} - -// D[x] = C[x-1] ^ ROT(C[x+1], 1) -inline fn keccakf1600_theta_rol(reg u64[5] c) -> reg u64[5] -{ - inline int x; - reg u64[5] d; - - for x = 0 to 5 - { // D[x] = C[x + 1] - d[x] = c[(x + 1) % 5]; - - // D[x] = ROT(D[x], 1) - _, _, d[x] = #ROL_64(d[x], 1); - - // D[x] ^= C[x-1] - d[x] ^= c[(x - 1 + 5) % 5]; - } - - return d; -} - -// B[x] = ROT( (A[x',y'] ^ D[x']), r[x',y'] ) with (x',y') = M^-1 (x,y) -// -// M = (0 1) M^-1 = (1 3) x' = 1x + 3y -// (2 3) (1 0) y' = 1x + 0y -// -inline fn keccakf1600_rol_sum( - reg ptr u64[25] a, - reg u64[5] d, - inline int y) - -> - reg u64[5] -{ - inline int r x x_ y_; - reg u64[5] b; - - for x = 0 to 5 - { - x_ = (x + 3*y) % 5; - y_ = x; - r = keccakf1600_rhotates(x_, y_); - - // B[x] = A[x',y'] - b[x] = a[x_ + y_*5]; - - // B[x] ^= D[x']; - b[x] ^= d[x_]; - - // B[x] = ROT( B[x], r[x',y'] ); - if(r != 0) - { _, _, b[x] = #ROL_64(b[x], r); } - - } - - return b; -} - -// E[x, y] = B[x] ^ ( (!B[x+1]) & B[x+2] ) -// -- when x and y are 0: E[0,0] ^= RC[i]; -inline fn keccakf1600_set_row( - reg ptr u64[25] e, - reg u64[5] b, - inline int y, - stack u64 s_rc) - -> - reg ptr u64[25] -{ - inline int x x1 x2; - reg u64 t; - - for x=0 to 5 - { - x1 = (x + 1) % 5; - x2 = (x + 2) % 5; - - t = !b[x1] & b[x2]; // bmi1 - //t = b[x1]; t = !t; t &= b[x2]; - - t ^= b[x]; - if( x==0 && y==0 ){ t ^= s_rc; } - e[x + y*5] = t; - } - - return e; -} - -inline fn keccakf1600_round( - reg ptr u64[25] e, - reg ptr u64[25] a, - reg u64 rc) - -> - reg ptr u64[25] -{ - inline int y; - reg u64[5] b c d; - stack u64 s_rc; - - s_rc = rc; - - c = keccakf1600_theta_sum(a); - d = keccakf1600_theta_rol(c); - - for y = 0 to 5 - { b = keccakf1600_rol_sum(a, d, y); - e = keccakf1600_set_row(e, b, y, s_rc); - } - - return e; -} - -inline fn __keccakf1600(reg ptr u64[25] a) -> reg ptr u64[25] -{ - reg ptr u64[24] RC; - stack u64[25] s_e; - reg ptr u64[25] e; - - reg u64 c rc; - - RC = KECCAK1600_RC; - e = s_e; - - c = 0; - while (c < KECCAK_ROUNDS - 1) - { - rc = RC[(int) c]; - e = keccakf1600_round(e, a, rc); - - rc = RC[(int) c + 1]; - a = keccakf1600_round(a, e, rc); - - c += 2; - } - - return a; -} - -fn _keccakf1600(reg ptr u64[25] a) -> reg ptr u64[25] -{ - a = __keccakf1600(a); - return a; -} - -inline fn _keccakf1600_(reg ptr u64[25] a) -> reg ptr u64[25] -{ - a = a; - a = _keccakf1600(a); - a = a; - return a; -} diff --git a/code/jasmin/mlkem_avx2/keccak_OLD/keccakf1600_4x_avx2_compact.jinc b/code/jasmin/mlkem_avx2/keccak_OLD/keccakf1600_4x_avx2_compact.jinc deleted file mode 100644 index 0f0f4f8a..00000000 --- a/code/jasmin/mlkem_avx2/keccak_OLD/keccakf1600_4x_avx2_compact.jinc +++ /dev/null @@ -1,331 +0,0 @@ - -require "keccakf1600_generic.jinc" - -u256[24] KECCAK1600_RC_AVX2 = -{ (4u64)[0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001], - (4u64)[0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082], - (4u64)[0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a], - (4u64)[0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000], - (4u64)[0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b], - (4u64)[0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001], - (4u64)[0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081], - (4u64)[0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009], - (4u64)[0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a], - (4u64)[0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088], - (4u64)[0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009], - (4u64)[0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a], - (4u64)[0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b], - (4u64)[0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b], - (4u64)[0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089], - (4u64)[0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003], - (4u64)[0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002], - (4u64)[0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080], - (4u64)[0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a], - (4u64)[0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a], - (4u64)[0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081], - (4u64)[0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080], - (4u64)[0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001], - (4u64)[0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008] -}; - -u256 ROL56 = 0x181F1E1D1C1B1A191017161514131211080F0E0D0C0B0A090007060504030201; -u256 ROL8 = 0x1E1D1C1B1A19181F16151413121110170E0D0C0B0A09080F0605040302010007; - -// C[x] = A[x,0] ^ A[x,1] ^ A[x,2] ^ A[x,3] ^ A[x,4] -inline fn keccakf1600_4x_theta_sum(reg ptr u256[25] a) -> reg u256[5] -{ - inline int x y; - reg u256[5] c; - - // C[x] = A[x, 0] - for x=0 to 5 - { c[x] = a[x + 0]; } - - // C[x] ^= A[x,1] ^ A[x,2] ^ A[x,3] ^ A[x,4] - for y=1 to 5 - { for x=0 to 5 - { c[x] ^= a[x + y*5]; } - } - - return c; -} - -inline fn keccakf1600_4x_rol(reg u256[5] a, inline int x r, reg u256 r8 r56) -> reg u256[5] -{ - reg u256 t; - - if(r == 8) - { a[x] = #VPSHUFB_256(a[x], r8); } - else { if(r == 56) - { a[x] = #VPSHUFB_256(a[x], r56); } - else - { t = #VPSLL_4u64(a[x], r); - a[x] = #VPSRL_4u64(a[x], 64 - r); - a[x] |= t; } - } - - return a; -} - -// D[x] = C[x-1] ^ ROT(C[x+1], 1) -inline fn keccakf1600_4x_theta_rol(reg u256[5] c, reg u256 r8 r56) -> reg u256[5] -{ - inline int x; - reg u256[5] d; - - for x = 0 to 5 - { // D[x] = C[x + 1] - d[x] = c[(x + 1) % 5]; - - // D[x] = ROT(D[x], 1) - d = keccakf1600_4x_rol(d, x, 1, r8, r56); - - // D[x] ^= C[x-1] - d[x] ^= c[(x - 1 + 5) % 5]; - } - - return d; -} - - -// B[x] = ROT( (A[x',y'] ^ D[x']), r[x',y'] ) with (x',y') = M^-1 (x,y) -// -// M = (0 1) M^-1 = (1 3) x' = 1x + 3y -// (2 3) (1 0) y' = 1x + 0y -// -inline fn keccakf1600_4x_rol_sum( - reg ptr u256[25] a, - reg u256[5] d, - inline int y, - reg u256 r8 r56 -) -> reg u256[5] -{ - inline int r x x_ y_; - reg u256[5] b; - - for x = 0 to 5 - { - x_ = (x + 3*y) % 5; - y_ = x; - r = keccakf1600_rhotates(x_, y_); - - // B[x] = A[x',y'] - b[x] = a[x_ + y_*5]; - - // B[x] ^= D[x']; - b[x] ^= d[x_]; - - // B[x] = ROT( B[x], r[x',y'] ); - if(r != 0) - { b = keccakf1600_4x_rol(b, x, r, r8, r56); } - } - - return b; -} - - -// E[x, y] = B[x] ^ ( (!B[x+1]) & B[x+2] ) -// -- when x and y are 0: E[0,0] ^= RC[i]; -inline fn keccakf1600_4x_set_row( - reg ptr u256[25] e, - reg u256[5] b, - inline int y, - reg u256 rc -) -> reg ptr u256[25] -{ - inline int x x1 x2; - reg u256 t; - - for x=0 to 5 - { - x1 = (x + 1) % 5; - x2 = (x + 2) % 5; - - t = #VPANDN_256(b[x1], b[x2]); - - t ^= b[x]; - if( x==0 && y==0 ){ t ^= rc; } - e[x + y*5] = t; - } - - return e; -} - - -fn keccakf1600_4x_round(reg ptr u256[25] e a, reg u256 rc r8 r56) -> reg ptr u256[25] -{ - inline int y; - reg u256[5] b c d; - - c = keccakf1600_4x_theta_sum(a); - d = keccakf1600_4x_theta_rol(c, r8, r56); - - for y = 0 to 5 - { b = keccakf1600_4x_rol_sum(a, d, y, r8, r56); - e = keccakf1600_4x_set_row(e, b, y, rc); - } - - return e; -} - -//////////////////////////////////////////////////////////////////////////////// - -inline fn __keccakf1600_4x(reg ptr u256[25] a) -> reg ptr u256[25] -{ - #mmx reg ptr u256[25] a_s; - - reg ptr u256[24] RC; - - stack u256[25] s_e; - reg ptr u256[25] e; - - reg u256 rc r8 r56; - reg u64 c; - - RC = KECCAK1600_RC_AVX2; - e = s_e; - r8 = ROL8; - r56 = ROL56; - - c = 0; - while(c < (KECCAK_ROUNDS*32)) - { - rc = RC.[(int) c]; - e = keccakf1600_4x_round(e, a, rc, r8, r56); - - // just an expensive pointer swap (#todo request feature) - a_s = a; s_e = e; - a = a_s; e = s_e; - - rc = RC.[(int) c + 32]; - a = keccakf1600_4x_round(a, e, rc, r8, r56); - - // just an expensive pointer swap (#todo request feature) - a_s = a; s_e = e; - a = a_s; e = s_e; - - c += 64; - } - - return a; -} - -fn _keccakf1600_4x_(reg ptr u256[25] a) -> reg ptr u256[25] -{ - a = __keccakf1600_4x(a); - return a; -} - -inline fn _keccakf1600_4x(reg ptr u256[25] a) -> reg ptr u256[25] -{ - a = a; - a = _keccakf1600_4x_(a); - a = a; - return a; -} - -// pack 4 keccak states (st25) into a 4-way state (st4x) -inline fn __u256x4_4u64x4 -( reg u256 x0 x1 x2 x3 -) -> reg u256, reg u256, reg u256, reg u256 { - // x0 = l00 l01 l02 l03 - // x1 = l10 l11 l12 l13 - // x2 = l20 l21 l22 l23 - // x3 = l30 l31 l32 l33 - reg u256 y0, y1, y2, y3; - y0 = #VPUNPCKL_4u64(x0, x1); // y0 = l00 l10 l02 l12 - y1 = #VPUNPCKH_4u64(x0, x1); // y1 = l01 l11 l03 l13 - y2 = #VPUNPCKL_4u64(x2, x3); // y2 = l20 l30 l22 l32 - y3 = #VPUNPCKH_4u64(x2, x3); // y3 = l21 l31 l23 l33 - - x0 = #VPERM2I128(y0, y2, 0x20); // x0 = l00 l10 l20 l30 - x1 = #VPERM2I128(y1, y3, 0x20); // x1 = l01 l11 l21 l31 - x2 = #VPERM2I128(y0, y2, 0x31); // x2 = l02 l12 l22 l32 - x3 = #VPERM2I128(y1, y3, 0x31); // x3 = l03 l13 l23 l33 - - return x0, x1, x2, x3; -} - -inline fn __st4x_pack -( reg mut ptr u256[25] st4x -, reg const ptr u64[25] st0 st1 st2 st3 -) -> reg ptr u256[25] { - inline int i; - reg u256 x0, x1, x2, x3; - reg u64 t0, t1, t2, t3; - for i = 0 to 6 { - x0 = st0[u256 i]; - x1 = st1[u256 i]; - x2 = st2[u256 i]; - x3 = st3[u256 i]; - x0, x1, x2, x3 = __u256x4_4u64x4(x0, x1, x2, x3); - st4x[4*i+0] = x0; - st4x[4*i+1] = x1; - st4x[4*i+2] = x2; - st4x[4*i+3] = x3; - } - t0 = st0[24]; - t1 = st1[24]; - t2 = st2[24]; - t3 = st3[24]; - st4x[u64 4*24+0] = t0; - st4x[u64 4*24+1] = t1; - st4x[u64 4*24+2] = t2; - st4x[u64 4*24+3] = t3; - - return st4x; -} - - - -// extracts 4 keccak states (st25) from a 4-way state (st4x) -inline fn __4u64x4_u256x4 -( reg u256 y0 y1 y2 y3 -) -> reg u256, reg u256, reg u256, reg u256 { - // y0 = l00 l10 l20 l30 - // y1 = l01 l11 l21 l31 - // y2 = l02 l12 l22 l32 - // y3 = l03 l13 l23 l33 - reg u256 x0, x1, x2, x3; - x0 = #VPERM2I128(y0, y2, 0x20); // x0 = l00 l10 l02 l12 - x1 = #VPERM2I128(y1, y3, 0x20); // x1 = l01 l11 l03 l13 - x2 = #VPERM2I128(y0, y2, 0x31); // x2 = l20 l30 l22 l32 - x3 = #VPERM2I128(y1, y3, 0x31); // x3 = l21 l31 l23 l33 - - y0 = #VPUNPCKL_4u64(x0, x1); // y0 = l00 l01 l02 l03 - y1 = #VPUNPCKH_4u64(x0, x1); // y1 = l10 l11 l12 l13 - y2 = #VPUNPCKL_4u64(x2, x3); // y2 = l20 l21 l22 l23 - y3 = #VPUNPCKH_4u64(x2, x3); // y3 = l30 l31 l32 l33 - - return y0, y1, y2, y3; -} - -inline fn __st4x_unpack -( reg mut ptr u64[25] st0 st1 st2 st3 -, reg const ptr u256[25] st4x -) -> reg ptr u64[25], reg ptr u64[25], reg ptr u64[25], reg ptr u64[25] { - inline int i; - reg u256 x0, x1, x2, x3; - reg u64 t0, t1, t2, t3; - for i = 0 to 6 { - x0 = st4x[u256 4*i+0]; - x1 = st4x[u256 4*i+1]; - x2 = st4x[u256 4*i+2]; - x3 = st4x[u256 4*i+3]; - x0, x1, x2, x3 = __4u64x4_u256x4(x0, x1, x2, x3); - st0.[u256 4*8*i] = x0; - st1.[u256 4*8*i] = x1; - st2.[u256 4*8*i] = x2; - st3.[u256 4*8*i] = x3; - } - t0 = st4x[u64 4*24+0]; - t1 = st4x[u64 4*24+1]; - t2 = st4x[u64 4*24+2]; - t3 = st4x[u64 4*24+3]; - st0.[u64 8*24] = t0; - st1.[u64 8*24] = t1; - st2.[u64 8*24] = t2; - st3.[u64 8*24] = t3; - - return st0, st1, st2, st3; -} diff --git a/code/jasmin/mlkem_avx2/keccak_OLD/keccakf1600_avx2.jinc b/code/jasmin/mlkem_avx2/keccak_OLD/keccakf1600_avx2.jinc deleted file mode 100644 index bbc0d321..00000000 --- a/code/jasmin/mlkem_avx2/keccak_OLD/keccakf1600_avx2.jinc +++ /dev/null @@ -1,316 +0,0 @@ -require "keccakf1600_generic.jinc" - -u256[24] KECCAK_IOTAS = -{ (4u64)[0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001] - ,(4u64)[0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082] - ,(4u64)[0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a] - ,(4u64)[0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000] - ,(4u64)[0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b] - ,(4u64)[0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001] - ,(4u64)[0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081] - ,(4u64)[0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009] - ,(4u64)[0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a] - ,(4u64)[0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088] - ,(4u64)[0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009] - ,(4u64)[0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a] - ,(4u64)[0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b] - ,(4u64)[0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b] - ,(4u64)[0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089] - ,(4u64)[0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003] - ,(4u64)[0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002] - ,(4u64)[0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080] - ,(4u64)[0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a] - ,(4u64)[0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a] - ,(4u64)[0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081] - ,(4u64)[0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080] - ,(4u64)[0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001] - ,(4u64)[0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008] -}; - - -u256[6] KECCAK_RHOTATES_LEFT = -{ - (4u64)[41, 36, 18, 3], - (4u64)[27, 28, 62, 1], - (4u64)[39, 56, 6, 45], - (4u64)[ 8, 55, 61, 10], - (4u64)[20, 25, 15, 2], - (4u64)[14, 21, 43, 44] -}; - - -u256[6] KECCAK_RHOTATES_RIGHT = -{ - (4u64)[64-41, 64-36, 64-18, 64- 3], - (4u64)[64-27, 64-28, 64-62, 64- 1], - (4u64)[64-39, 64-56, 64- 6, 64-45], - (4u64)[64- 8, 64-55, 64-61, 64-10], - (4u64)[64-20, 64-25, 64-15, 64- 2], - (4u64)[64-14, 64-21, 64-43, 64-44] -}; - - -fn _keccakf1600_avx2(reg u256[7] state) -> reg u256[7] -{ - reg u256[9] t; - reg u256 c00 c14 d00 d14; - - reg bool zf; - reg u64 r iotas_o; - - reg ptr u256[24] iotas_p; - reg ptr u256[6] rhotates_left_p; - reg ptr u256[6] rhotates_right_p; - - iotas_p = KECCAK_IOTAS; - iotas_o = 0; - rhotates_left_p = KECCAK_RHOTATES_LEFT; - rhotates_right_p = KECCAK_RHOTATES_RIGHT; - - r = KECCAK_ROUNDS; - while - { - //######################################## Theta - c00 = #VPSHUFD_256(state[2], (4u2)[1,0,3,2]); - c14 = state[5] ^ state[3]; - t[2] = state[4] ^ state[6]; - c14 = c14 ^ state[1]; - c14 = c14 ^ t[2]; - t[4] = #VPERMQ(c14, (4u2)[2,1,0,3]); - c00 = c00 ^ state[2]; - t[0] = #VPERMQ(c00, (4u2)[1,0,3,2]); - t[1] = c14 >>4u64 63; - t[2] = c14 +4u64 c14; - t[1] = t[1] | t[2]; - d14 = #VPERMQ(t[1], (4u2)[0,3,2,1]); - d00 = t[1] ^ t[4]; - d00 = #VPERMQ(d00, (4u2)[0,0,0,0]); - c00 = c00 ^ state[0]; - c00 = c00 ^ t[0]; - t[0] = c00 >>4u64 63; - t[1] = c00 +4u64 c00; - t[1] = t[1] | t[0]; - state[2] = state[2] ^ d00; - state[0] = state[0] ^ d00; - d14 = #VPBLEND_8u32(d14, t[1], (8u1)[1,1,0,0,0,0,0,0]); - t[4] = #VPBLEND_8u32(t[4], c00, (8u1)[0,0,0,0,0,0,1,1]); - d14 = d14 ^ t[4]; - - //######################################## Rho + Pi + pre-Chi shuffle - t[3] = #VPSLLV_4u64(state[2], rhotates_left_p[0] ); - state[2] = #VPSRLV_4u64(state[2], rhotates_right_p[0] ); - state[2] = state[2] | t[3]; - state[3] = state[3] ^ d14; - t[4] = #VPSLLV_4u64(state[3], rhotates_left_p[2] ); - state[3] = #VPSRLV_4u64(state[3], rhotates_right_p[2] ); - state[3] = state[3] | t[4]; - state[4] = state[4] ^ d14; - t[5] = #VPSLLV_4u64(state[4], rhotates_left_p[3] ); - state[4] = #VPSRLV_4u64(state[4], rhotates_right_p[3] ); - state[4] = state[4] | t[5]; - state[5] = state[5] ^ d14; - t[6] = #VPSLLV_4u64(state[5], rhotates_left_p[4] ); - state[5] = #VPSRLV_4u64(state[5], rhotates_right_p[4] ); - state[5] = state[5] | t[6]; - state[6] = state[6] ^ d14; - t[3] = #VPERMQ(state[2], (4u2)[2,0,3,1]); - t[4] = #VPERMQ(state[3], (4u2)[2,0,3,1]); - t[7] = #VPSLLV_4u64(state[6], rhotates_left_p[5] ); - t[1] = #VPSRLV_4u64(state[6], rhotates_right_p[5] ); - t[1] = t[1] | t[7]; - state[1] = state[1] ^ d14; - t[5] = #VPERMQ(state[4], (4u2)[0,1,2,3]); - t[6] = #VPERMQ(state[5], (4u2)[1,3,0,2]); - t[8] = #VPSLLV_4u64(state[1], rhotates_left_p[1] ); - t[2] = #VPSRLV_4u64(state[1], rhotates_right_p[1] ); - t[2] = t[2] | t[8]; - - //######################################## Chi - t[7] = #VPSRLDQ_256(t[1], 8); - t[0] = !t[1] & t[7]; - state[3] = #VPBLEND_8u32(t[2], t[6], (8u1)[0,0,0,0,1,1,0,0]); - t[8] = #VPBLEND_8u32(t[4], t[2], (8u1)[0,0,0,0,1,1,0,0]); - state[5] = #VPBLEND_8u32(t[3], t[4], (8u1)[0,0,0,0,1,1,0,0]); - t[7] = #VPBLEND_8u32(t[2], t[3], (8u1)[0,0,0,0,1,1,0,0]); - state[3] = #VPBLEND_8u32(state[3], t[4], (8u1)[0,0,1,1,0,0,0,0]); - t[8] = #VPBLEND_8u32(t[8], t[5], (8u1)[0,0,1,1,0,0,0,0]); - state[5] = #VPBLEND_8u32(state[5], t[2], (8u1)[0,0,1,1,0,0,0,0]); - t[7] = #VPBLEND_8u32(t[7], t[6], (8u1)[0,0,1,1,0,0,0,0]); - state[3] = #VPBLEND_8u32(state[3], t[5], (8u1)[1,1,0,0,0,0,0,0]); - t[8] = #VPBLEND_8u32(t[8], t[6], (8u1)[1,1,0,0,0,0,0,0]); - state[5] = #VPBLEND_8u32(state[5], t[6], (8u1)[1,1,0,0,0,0,0,0]); - t[7] = #VPBLEND_8u32(t[7], t[4], (8u1)[1,1,0,0,0,0,0,0]); - state[3] = !state[3] & t[8]; - state[5] = !state[5] & t[7]; - state[6] = #VPBLEND_8u32(t[5], t[2], (8u1)[0,0,0,0,1,1,0,0]); - t[8] = #VPBLEND_8u32(t[3], t[5], (8u1)[0,0,0,0,1,1,0,0]); - state[3] = state[3] ^ t[3]; - state[6] = #VPBLEND_8u32(state[6], t[3], (8u1)[0,0,1,1,0,0,0,0]); - t[8] = #VPBLEND_8u32(t[8], t[4], (8u1)[0,0,1,1,0,0,0,0]); - state[5] = state[5] ^ t[5]; - state[6] = #VPBLEND_8u32(state[6], t[4], (8u1)[1,1,0,0,0,0,0,0]); - t[8] = #VPBLEND_8u32(t[8], t[2], (8u1)[1,1,0,0,0,0,0,0]); - state[6] = !state[6] & t[8]; - state[6] = state[6] ^ t[6]; - state[4] = #VPERMQ(t[1], (4u2)[0,1,3,2]); - t[8] = #VPBLEND_8u32(state[4], state[0], (8u1)[0,0,1,1,0,0,0,0]); - state[1] = #VPERMQ(t[1], (4u2)[0,3,2,1]); - state[1] = #VPBLEND_8u32(state[1], state[0], (8u1)[1,1,0,0,0,0,0,0]); - state[1] = !state[1] & t[8]; - state[2] = #VPBLEND_8u32(t[4], t[5], (8u1)[0,0,0,0,1,1,0,0]); - t[7] = #VPBLEND_8u32(t[6], t[4], (8u1)[0,0,0,0,1,1,0,0]); - state[2] = #VPBLEND_8u32(state[2], t[6], (8u1)[0,0,1,1,0,0,0,0]); - t[7] = #VPBLEND_8u32(t[7], t[3], (8u1)[0,0,1,1,0,0,0,0]); - state[2] = #VPBLEND_8u32(state[2], t[3], (8u1)[1,1,0,0,0,0,0,0]); - t[7] = #VPBLEND_8u32(t[7], t[5], (8u1)[1,1,0,0,0,0,0,0]); - state[2] = !state[2] & t[7]; - state[2] = state[2] ^ t[2]; - t[0] = #VPERMQ(t[0], (4u2)[0,0,0,0]); - state[3] = #VPERMQ(state[3], (4u2)[0,1,2,3]); - state[5] = #VPERMQ(state[5], (4u2)[2,0,3,1]); - state[6] = #VPERMQ(state[6], (4u2)[1,3,0,2]); - state[4] = #VPBLEND_8u32(t[6], t[3], (8u1)[0,0,0,0,1,1,0,0]); - t[7] = #VPBLEND_8u32(t[5], t[6], (8u1)[0,0,0,0,1,1,0,0]); - state[4] = #VPBLEND_8u32(state[4], t[5], (8u1)[0,0,1,1,0,0,0,0]); - t[7] = #VPBLEND_8u32(t[7], t[2], (8u1)[0,0,1,1,0,0,0,0]); - state[4] = #VPBLEND_8u32(state[4], t[2], (8u1)[1,1,0,0,0,0,0,0]); - t[7] = #VPBLEND_8u32(t[7], t[3], (8u1)[1,1,0,0,0,0,0,0]); - state[4] = !state[4] & t[7]; - state[0] = state[0] ^ t[0]; - state[1] = state[1] ^ t[1]; - state[4] = state[4] ^ t[4]; - - //######################################## Iota - state[0] = state[0] ^ iotas_p.[(int) iotas_o]; - iotas_o += 32; - - _,_,_,zf,r = #DEC_64(r); - }(!zf) - - return state; -} - -// converts a (plain) keccak state (st25) into the avx2 representation -inline fn __stavx2_pack -( reg const ptr u64[25] st -) -> reg u256[7] { - // 3*r256 (evitáveis...) - reg u256[7] state; - reg u256 t256_0 t256_1 t256_2; - reg u128 t128_0, t128_1; - reg u64 r; - - // [ 0 0 0 0 ] - state[0] = #VPBROADCAST_4u64(st.[u64 8*0]); - // [ 1 2 3 4 ] - state[1] = st.[u256 1*8]; - // [ 5 - ] - t128_0 = #VMOV(st[5]); - // [ 6 7 8 9 ] - state[3] = st.[u256 6*8]; - // [ 10 - ] - t128_1 = #VMOV(st[10]); - // [ 11 12 13 14 ] - state[4] = st.[u256 11*8]; - // [ 5 15 ] - r = st[15]; - t128_0 = #VPINSR_2u64(t128_0, r, 1); - // [ 16 17 18 19 ] - state[5] = st.[u256 16*8]; - // [ 10 20 ] - r = st[20]; - t128_1 = #VPINSR_2u64(t128_1, r, 1); - // alternative not currently supported: VPGATHERDQ for filling state[2] - // [ 10 20 5 15 ] - state[2] = (2u128)[t128_0, t128_1]; - // [ 21 22 23 24 ] - state[6] = st.[u256 21*8]; - - // [ 16 7 8 19 ] - t256_0 = #VPBLEND_8u32(state[3], state[5], (8u1)[1,1,0,0,0,0,1,1]); - // [ 11 22 23 14 ] - t256_1 = #VPBLEND_8u32(state[6], state[4], (8u1)[1,1,0,0,0,0,1,1]); - // [ 6 12 13 9 ] - t256_2 = #VPBLEND_8u32(state[4], state[3], (8u1)[1,1,0,0,0,0,1,1]); - - // [ 16 7 23 14 ] - state[3] = #VPBLEND_8u32(t256_0, t256_1, (8u1)[1,1,1,1,0,0,0,0]); - // [ 11 22 8 19 ] - state[4] = #VPBLEND_8u32(t256_1, t256_0, (8u1)[1,1,1,1,0,0,0,0]); - - // [ 21 17 18 24 ] - t256_0 = #VPBLEND_8u32(state[5], state[6], (8u1)[1,1,0,0,0,0,1,1]); - - // [ 21 17 13 9 ] - state[5] = #VPBLEND_8u32(t256_0, t256_2, (8u1)[1,1,1,1,0,0,0,0]); - // [ 6 12 18 24 ] - state[6] = #VPBLEND_8u32(t256_2, t256_0, (8u1)[1,1,1,1,0,0,0,0]); - - // [ 0 0 0 0 ] - // [ 1 2 3 4 ] - // [ 10 20 5 15 ] - // [ 16 7 23 14 ] - // [ 11 22 8 19 ] - // [ 21 17 13 9 ] - // [ 6 12 18 24 ] - return state; -} - -// recovers a (plain) keccak state (st25) from an avx2-encoded one -inline fn __stavx2_unpack -( reg mut ptr u64[25] st -, reg u256[7] state -) -> reg ptr u64[25] { - // 5*r256 + 2*r128(evitáveis) (+7*r256) - reg u256 t256_0 t256_1 t256_2 t256_3 t256_4; - reg u128 t128_0, t128_1; - - // [ 0, 0 ] - t128_0 = (128u) state[0]; - st[0] = #VMOVLPD(t128_0); - // [ 1, 2, 3, 4 ] - st.[u256 1*8] = state[1]; - - // [ 16, 7, 8, 19 ] - t256_0 = #VPBLEND_8u32(state[3], state[4], (8u1)[1,1,1,1,0,0,0,0]); - // [ 11, 22, 23, 14 ] - t256_1 = #VPBLEND_8u32(state[4], state[3], (8u1)[1,1,1,1,0,0,0,0]); - // [ 21, 17, 18, 24 ] - t256_2 = #VPBLEND_8u32(state[5], state[6], (8u1)[1,1,1,1,0,0,0,0]); - // [ 6, 12, 13, 9 ] - t256_3 = #VPBLEND_8u32(state[6], state[5], (8u1)[1,1,1,1,0,0,0,0]); - - // [ 5, 15 ] -// state[2] = TTT[0]; - t128_1 = #VEXTRACTI128(state[2], 1); - st[5] = #VMOVLPD(t128_1); - - // [ 6, 7, 8, 9 ] - t256_4 = #VPBLEND_8u32(t256_0, t256_3, (8u1)[1,1,0,0,0,0,1,1]); - st.[u256 6*8] = t256_4; - - // [ 10, 20 ] - t128_0 = (128u) state[2]; - st[10] = #VMOVLPD(t128_0); - - // [ 11, 12, 13, 14 ] - t256_4 = #VPBLEND_8u32(t256_3, t256_1, (8u1)[1,1,0,0,0,0,1,1]); - st.[u256 11*8] = t256_4; - - // [ 15 ] - st[15] = #VMOVHPD(t128_1); - - // [ 16, 17, 18, 19 ] - t256_4 = #VPBLEND_8u32(t256_2, t256_0, (8u1)[1,1,0,0,0,0,1,1]); - st.[u256 16*8] = t256_4; - - // [ 20 ] - st[20] = #VMOVHPD(t128_0); - - // [ 21, 22, 23, 24 ] - t256_4 = #VPBLEND_8u32(t256_1, t256_2, (8u1)[1,1,0,0,0,0,1,1]); - st.[u256 21*8] = t256_4; - - return st; -} - diff --git a/code/jasmin/mlkem_avx2/keccak_OLD/keccakf1600_generic.jinc b/code/jasmin/mlkem_avx2/keccak_OLD/keccakf1600_generic.jinc deleted file mode 100644 index c11a69b8..00000000 --- a/code/jasmin/mlkem_avx2/keccak_OLD/keccakf1600_generic.jinc +++ /dev/null @@ -1,64 +0,0 @@ -param int KECCAK_ROUNDS = 24; - -u64[24] KECCAK1600_RC = -{ 0x0000000000000001 - ,0x0000000000008082 - ,0x800000000000808a - ,0x8000000080008000 - ,0x000000000000808b - ,0x0000000080000001 - ,0x8000000080008081 - ,0x8000000000008009 - ,0x000000000000008a - ,0x0000000000000088 - ,0x0000000080008009 - ,0x000000008000000a - ,0x000000008000808b - ,0x800000000000008b - ,0x8000000000008089 - ,0x8000000000008003 - ,0x8000000000008002 - ,0x8000000000000080 - ,0x000000000000800a - ,0x800000008000000a - ,0x8000000080008081 - ,0x8000000000008080 - ,0x0000000080000001 - ,0x8000000080008008 -}; - -inline fn keccakf1600_index(inline int x y) -> inline int -{ - inline int r; - r = (x % 5) + 5 * (y % 5); - return r; -} - - -inline fn keccakf1600_rho_offsets(inline int i) -> inline int -{ - inline int r x y z t; - - r = 0; - x = 1; - y = 0; - - for t = 0 to 24 - { if (i == x + 5 * y) - { r = ((t + 1) * (t + 2) / 2) % 64; } - z = (2 * x + 3 * y) % 5; - x = y; - y = z; - } - - return r; -} - -inline fn keccakf1600_rhotates(inline int x y) -> inline int -{ - inline int i r; - i = keccakf1600_index(x, y); - r = keccakf1600_rho_offsets(i); - return r; -} - diff --git a/code/jasmin/mlkem_avx2/mlkem_keccak_avx2_TRANSITION.jinc b/code/jasmin/mlkem_avx2/mlkem_keccak_avx2_TRANSITION.jinc deleted file mode 100644 index 66d7d7f5..00000000 --- a/code/jasmin/mlkem_avx2/mlkem_keccak_avx2_TRANSITION.jinc +++ /dev/null @@ -1,175 +0,0 @@ - -namespace OLD_KECCAK { -require "keccak_OLD/fips202.jinc" -require "keccak_OLD/fips202_4x.jinc" - -inline fn _sha3_256A_M1184 -( #spill_to_mmx reg mut ptr u8[32] out -, #spill_to_mmx reg u64 in -) -> reg ptr u8[32] -{ reg u64 inlen; - inlen = 1184; - out = _isha3_256(out, in, inlen); - return out; -} - -inline fn _shake256_M32__M32_M1088 -( reg u64 out -, reg u64 in0 in1 // 32+MLKEM_INDCPA_CIPHERTEXTBYTES -) -{ _shake256_1120_32(out, in0, in1); } - -inline fn _shake256_A128__A32_A1 -( reg mut ptr u8[128] out -, reg const ptr u8[32] seed -, reg const ptr u8[1] nonce -) -> reg ptr u8[128] -{ reg u256 t256; - reg u8 t8; - stack u8[33] in_s; - reg ptr u8[33] in; - in = in_s; - t256 = seed[u256 0]; - in[u256 0] = t256; - t8 = nonce[0]; - in[32] = t8; - out = _shake256_128_33(out, in); - return out; -} - -inline fn _shake256x4_A128__A32_A1 -( reg mut ptr u8[128] out0 out1 out2 out3 -, reg const ptr u8[32] seed -, reg const ptr u8[4] nonce -) -> reg ptr u8[128] /* out0 */ - , reg ptr u8[128] /* out1 */ - , reg ptr u8[128] /* out2 */ - , reg ptr u8[128] /* out3 */ -{ reg u256 t256; - reg u8 t8; - stack u8[33] in0 in1 in2 in3; - stack u256[25] st_s; - reg ptr u256[25] st; - st = st_s; - t256 = seed[u256 0]; - in0[u256 0] = t256; - t8 = nonce[0]; - in0[32] = t8; - t8 = nonce[1]; - in1[u256 0] = t256; - in1[32] = t8; - t8 = nonce[2]; - in2[u256 0] = t256; - in2[32] = t8; - t8 = nonce[3]; - in3[u256 0] = t256; - in3[32] = t8; - st = _shake256_absorb4x_33(st, in0, in1, in2, in3); - st, out0, out1, out2, out3 = __shake256_squeezeblock4xTRANSITION(st, out0, out1, out2, out3); - return out0, out1, out2, out3; -} - -inline fn _sha3_256A_A32 -( #spill_to_mmx reg mut ptr u8[32] out -, reg const ptr u8[32] in -) -> reg ptr u8[32] -{ out = _isha3_256_32(out, in); return out; } - -inline fn _sha3_512A_A64 -( reg mut ptr u8[64] out -, reg const ptr u8[64] in -) -> reg ptr u8[64] -{ out = _sha3_512_64(out, in); return out; } - -inline fn _sha3_512A_A32 -( reg mut ptr u8[64] out -, reg const ptr u8[32] in -) -> reg ptr u8[64] -{ out = _sha3_512_32(out, in); return out; } - -} // OLD_KECCAK - -namespace NEW_KECCAK { -require "mlkem_keccak_avx2.jinc" -} - -inline fn _sha3_256A_M1184 -( #spill_to_mmx reg mut ptr u8[32] out -, #spill_to_mmx reg u64 in -) -> reg ptr u8[32] -{ out = NEW_KECCAK::_sha3_256A_M1184(out, in); return out; } - - -inline fn _shake256_M32__M32_M1088 -( reg u64 out -, reg u64 in0 in1 // 32+MLKEM_INDCPA_CIPHERTEXTBYTES -) -{ NEW_KECCAK::_shake256_M32__M32_M1088(out, in0, in1); } - -inline fn _shake256_A128__A32_A1 -( reg mut ptr u8[128] out -, reg const ptr u8[32] seed -, reg const ptr u8[1] nonce -) -> reg ptr u8[128] -{ out = NEW_KECCAK::_shake256_A128__A32_A1(out, seed, nonce); return out; } - -inline fn _shake256x4_A128__A32_A1 -( reg mut ptr u8[128] out0 out1 out2 out3 -, reg const ptr u8[32] seed -, reg const ptr u8[4] nonces -) -> reg ptr u8[128] /* out0 */ - , reg ptr u8[128] /* out1 */ - , reg ptr u8[128] /* out2 */ - , reg ptr u8[128] /* out3 */ -{ out0, out1, out2, out3 = NEW_KECCAK::_shake256x4_A128__A32_A1(out0, out1, out2, out3, seed, nonces); return out0, out1, out2, out3; } - -inline fn _sha3_256A_A32 -( #spill_to_mmx reg mut ptr u8[32] out -, reg const ptr u8[32] in -) -> reg ptr u8[32] -{ out = NEW_KECCAK::_sha3_256A_A32(out, in); return out; } - -inline fn _sha3_512A_A64 -( reg mut ptr u8[64] out -, reg const ptr u8[64] in -) -> reg ptr u8[64] -{ out = NEW_KECCAK::_sha3_512A_A64(out, in); return out; } - -inline fn _sha3_512A_A32 -( reg mut ptr u8[64] out -, reg const ptr u8[32] in -) -> reg ptr u8[64] -{ out = NEW_KECCAK::_sha3_512A_A32(out, in); return out; } - - -// Only available on the new version!!! -inline fn _shake128_absorb_A32_A2 -( reg const ptr u8[32] seed -, reg const ptr u8[2] pos -) -> reg u256[7] -{ reg u256[7] st; st = NEW_KECCAK::_shake128_absorb_A32_A2(seed, pos); return st; } - -inline fn _shake128x4_absorb_A32_A2 -( reg mut ptr u256[25] st -, reg const ptr u8[32] seed -, reg const ptr u8[8] pos -) -> reg ptr u256[25] -{ st = NEW_KECCAK::_shake128x4_absorb_A32_A2(st, seed, pos); return st; } - -inline fn _shake128_squeeze3blocks -( reg mut ptr u8[536] buf -, reg u256[7] st -) -> reg ptr u8[536] -{ buf = NEW_KECCAK::_shake128_squeeze3blocks( buf, st); return buf; } - -inline fn _shake128_next_state -( reg mut ptr u8[536] buf -) -> reg ptr u8[536] /* buf */ -{ buf = NEW_KECCAK::_shake128_next_state(buf); return buf; } - -inline fn _shake128x4_squeeze3blocks -( reg mut ptr u256[25] st -, reg mut ptr u8[4*536] buf -) -> reg ptr u256[25] /* st */ - , reg ptr u8[4*536] /* buf */ -{ st, buf = NEW_KECCAK::_shake128x4_squeeze3blocks(st, buf); return st, buf; } diff --git a/code/jasmin/mlkem_avx2/poly.jinc b/code/jasmin/mlkem_avx2/poly.jinc index 6a528888..409e8900 100644 --- a/code/jasmin/mlkem_avx2/poly.jinc +++ b/code/jasmin/mlkem_avx2/poly.jinc @@ -3,12 +3,7 @@ require "shuffle.jinc" require "consts.jinc" require "reduce.jinc" -require "mlkem_keccak_avx2_TRANSITION.jinc" -require "keccak_OLD/fips202_common.jinc" -/* replaced by "mlkem_keccak_avx2"... -require "fips202.jinc" -require "fips202_4x.jinc" -*/ +require "mlkem_keccak_avx2.jinc" fn _poly_add2(reg ptr u16[MLKEM_N] rp bp) -> stack u16[MLKEM_N] { @@ -537,7 +532,7 @@ fn _poly_frommsg_1(reg ptr u16[MLKEM_N] rp, reg ptr u8[32] ap) -> stack u16[MLKE return rp; } - +param int SHAKE256_RATE = 136; param int NOISE_NBLOCKS = (MLKEM_ETA1 * MLKEM_N/4 + SHAKE256_RATE - 1)/SHAKE256_RATE; u8[32] cbd_jshufbidx = {0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1, diff --git a/proof/correctness/avx2/AVX2_Ops.ec b/proof/correctness/avx2/AVX2_Ops.ec index dd8e8e56..27b00517 100644 --- a/proof/correctness/avx2/AVX2_Ops.ec +++ b/proof/correctness/avx2/AVX2_Ops.ec @@ -2142,7 +2142,9 @@ proof. rewrite eq_p eq_x //=. do (rewrite -get_unpack32 1:/# pack4K //=). do (rewrite get_of_list 1:/#). - smt(@Array4 @Array8 @List). +congr. +apply W8u32.Pack.all_eq_eq. +by rewrite /all_eq /= /#. qed. equiv eq_iVPSHUFB_256 : Ops.iVPSHUFB_256 ~ OpsV.iVPSHUFB_256: