From c448489f0ef9d8740980a7a9bd151c46f80257e7 Mon Sep 17 00:00:00 2001 From: Michael Schaffner Date: Tue, 16 Jan 2024 16:58:27 -0800 Subject: [PATCH] [prim, rom_ctrl] Remove S&P layer from data scrambling As elaborated on #20788, the S&P layer is disabled in the SRAM scrambling devices in order to improve error detection guarantees, interactions with ECC and timing. In order to minimize changes and keep the implementation around in case it is needed for byte parity at some point, we just set the NumDiffRounds parameter to zero for the modules that leverage prim_ram_1p_scr. In case of rom_ctrl, the functionality is removed entirely. Signed-off-by: Michael Schaffner --- hw/dv/sv/mem_bkdr_util/mem_bkdr_util__rom.sv | 1 - hw/dv/sv/mem_bkdr_util/sram_scrambler_pkg.sv | 106 +++++++++--------- .../verilator/cpp/scrambled_ecc32_mem_area.cc | 10 +- hw/ip/otbn/rtl/otbn.sv | 4 +- .../dv/prim_ram_scr/cpp/scramble_model.cc | 24 ++-- hw/ip/prim/rtl/prim_ram_1p_scr.sv | 8 +- hw/ip/rom_ctrl/doc/rom_ctrl_blockdiag.svg | 57 +++------- hw/ip/rom_ctrl/doc/theory_of_operation.md | 2 +- hw/ip/rom_ctrl/rtl/rom_ctrl_scrambled_rom.sv | 16 +-- hw/ip/sram_ctrl/doc/sram_ctrl_blockdiag.svg | 2 +- hw/ip/sram_ctrl/doc/theory_of_operation.md | 5 +- hw/ip/sram_ctrl/rtl/sram_ctrl.sv | 3 +- 12 files changed, 105 insertions(+), 133 deletions(-) diff --git a/hw/dv/sv/mem_bkdr_util/mem_bkdr_util__rom.sv b/hw/dv/sv/mem_bkdr_util/mem_bkdr_util__rom.sv index d021ca569d10cf..e96c15bc4171fa 100644 --- a/hw/dv/sv/mem_bkdr_util/mem_bkdr_util__rom.sv +++ b/hw/dv/sv/mem_bkdr_util/mem_bkdr_util__rom.sv @@ -59,7 +59,6 @@ virtual function bit [38:0] rom_encrypt_read32(bit [bus_params_pkg::BUS_AW-1:0] zero_key[i] = '0; end - data_arr = sram_scrambler_pkg::sp_decrypt(data_arr, 39, zero_key); for (int i = 0; i < 39; i++) begin data[i] = data_arr[i] ^ keystream[i]; end diff --git a/hw/dv/sv/mem_bkdr_util/sram_scrambler_pkg.sv b/hw/dv/sv/mem_bkdr_util/sram_scrambler_pkg.sv index f832a5f5b319b6..41c1ee4eb84b0f 100644 --- a/hw/dv/sv/mem_bkdr_util/sram_scrambler_pkg.sv +++ b/hw/dv/sv/mem_bkdr_util/sram_scrambler_pkg.sv @@ -237,10 +237,10 @@ package sram_scrambler_pkg; // SRAM data encryption is more involved, we need to run 2 rounds of PRINCE on the nonce and key // and then XOR the result with the data. // - // After that, the XORed data neeeds to them be passed through the S&P network one byte at a time. + // Optionally, the XORed data can be passed through the S&P network. function automatic state_t encrypt_sram_data(logic data[], int data_width, int sp_width, logic addr[], int addr_width, - logic key[], logic nonce[]); + logic key[], logic nonce[], bit use_sp_layer = 0); logic keystream[] = new[SRAM_BLOCK_WIDTH]; logic data_enc[] = new[data_width]; logic byte_to_enc[] = new[8]; @@ -262,31 +262,33 @@ package sram_scrambler_pkg; data_enc[i] = data[i] ^ keystream[i % ks_width]; end - if (data_width == sp_width) begin - // pass the entire word through the subst/perm network at once (the next cases would give the - // same results too, but this should be a bit more efficient) - data_enc = sp_encrypt(data_enc, data_width, zero_key); - end else if (sp_width == 8) begin - // pass each byte of the encoded result through the subst/perm network (special case of the - // general code below) - for (int i = 0; i < data_width / 8; i++) begin - byte_to_enc = data_enc[i*8 +: 8]; - enc_byte = sp_encrypt(byte_to_enc, 8, zero_key); - data_enc[i*8 +: 8] = enc_byte; - end - end else begin - // divide the word into sp_width chunks to pass it through the subst/perm network - for (int chunk_lsb = 0; chunk_lsb < data_width; chunk_lsb += sp_width) begin - int bits_remaining = data_width - chunk_lsb; - int chunk_width = (bits_remaining < sp_width) ? bits_remaining : sp_width; - logic chunk[] = new[chunk_width]; - - for (int j = 0; j < chunk_width; j++) begin - chunk[j] = data_enc[chunk_lsb + j]; + if (use_sp_layer) begin + if (data_width == sp_width) begin + // pass the entire word through the subst/perm network at once (the next cases would give the + // same results too, but this should be a bit more efficient) + data_enc = sp_encrypt(data_enc, data_width, zero_key); + end else if (sp_width == 8) begin + // pass each byte of the encoded result through the subst/perm network (special case of the + // general code below) + for (int i = 0; i < data_width / 8; i++) begin + byte_to_enc = data_enc[i*8 +: 8]; + enc_byte = sp_encrypt(byte_to_enc, 8, zero_key); + data_enc[i*8 +: 8] = enc_byte; end - chunk = sp_encrypt(chunk, chunk_width, zero_key); - for (int j = 0; j < chunk_width; j++) begin - data_enc[chunk_lsb + j] = chunk[j]; + end else begin + // divide the word into sp_width chunks to pass it through the subst/perm network + for (int chunk_lsb = 0; chunk_lsb < data_width; chunk_lsb += sp_width) begin + int bits_remaining = data_width - chunk_lsb; + int chunk_width = (bits_remaining < sp_width) ? bits_remaining : sp_width; + logic chunk[] = new[chunk_width]; + + for (int j = 0; j < chunk_width; j++) begin + chunk[j] = data_enc[chunk_lsb + j]; + end + chunk = sp_encrypt(chunk, chunk_width, zero_key); + for (int j = 0; j < chunk_width; j++) begin + data_enc[chunk_lsb + j] = chunk[j]; + end end end end @@ -296,7 +298,7 @@ package sram_scrambler_pkg; function automatic state_t decrypt_sram_data(logic data[], int data_width, int sp_width, logic addr[], int addr_width, - logic key[], logic nonce[]); + logic key[], logic nonce[], bit use_sp_layer = 0); logic keystream[] = new[SRAM_BLOCK_WIDTH]; logic data_dec[] = new[data_width]; logic byte_to_dec[] = new[8]; @@ -312,31 +314,33 @@ package sram_scrambler_pkg; // Generate the keystream keystream = gen_keystream(addr, addr_width, key, nonce); - if (data_width == sp_width) begin - // pass the entire word through the subst/perm network at once (the next cases would give the - // same results too, but this should be a bit more efficient) - data_dec = sp_decrypt(data, data_width, zero_key); - end else if (sp_width == 8) begin - // pass each byte of the data through the subst/perm network (special case of the general code - // below) - for (int i = 0; i < data_width / 8; i++) begin - byte_to_dec = data[i*8 +: 8]; - dec_byte = sp_decrypt(byte_to_dec, 8, zero_key); - data_dec[i*8 +: 8] = dec_byte; - end - end else begin - // divide the word into sp_width chunks to pass it through the subst/perm network - for (int chunk_lsb = 0; chunk_lsb < data_width; chunk_lsb += sp_width) begin - int bits_remaining = data_width - chunk_lsb; - int chunk_width = (bits_remaining < sp_width) ? bits_remaining : sp_width; - logic chunk[] = new[chunk_width]; - - for (int j = 0; j < chunk_width; j++) begin - chunk[j] = data[chunk_lsb + j]; + if (use_sp_layer) begin + if (data_width == sp_width) begin + // pass the entire word through the subst/perm network at once (the next cases would give the + // same results too, but this should be a bit more efficient) + data_dec = sp_decrypt(data, data_width, zero_key); + end else if (sp_width == 8) begin + // pass each byte of the data through the subst/perm network (special case of the general code + // below) + for (int i = 0; i < data_width / 8; i++) begin + byte_to_dec = data[i*8 +: 8]; + dec_byte = sp_decrypt(byte_to_dec, 8, zero_key); + data_dec[i*8 +: 8] = dec_byte; end - chunk = sp_decrypt(chunk, chunk_width, zero_key); - for (int j = 0; j < chunk_width; j++) begin - data_dec[chunk_lsb + j] = chunk[j]; + end else begin + // divide the word into sp_width chunks to pass it through the subst/perm network + for (int chunk_lsb = 0; chunk_lsb < data_width; chunk_lsb += sp_width) begin + int bits_remaining = data_width - chunk_lsb; + int chunk_width = (bits_remaining < sp_width) ? bits_remaining : sp_width; + logic chunk[] = new[chunk_width]; + + for (int j = 0; j < chunk_width; j++) begin + chunk[j] = data[chunk_lsb + j]; + end + chunk = sp_decrypt(chunk, chunk_width, zero_key); + for (int j = 0; j < chunk_width; j++) begin + data_dec[chunk_lsb + j] = chunk[j]; + end end end end diff --git a/hw/dv/verilator/cpp/scrambled_ecc32_mem_area.cc b/hw/dv/verilator/cpp/scrambled_ecc32_mem_area.cc index 65f2b36b610ada..1206c8656990c0 100644 --- a/hw/dv/verilator/cpp/scrambled_ecc32_mem_area.cc +++ b/hw/dv/verilator/cpp/scrambled_ecc32_mem_area.cc @@ -162,9 +162,10 @@ void ScrambledEcc32MemArea::WriteBuffer(uint8_t buf[SV_MEM_WIDTH_BYTES], std::vector ScrambledEcc32MemArea::ReadUnscrambled( const uint8_t buf[SV_MEM_WIDTH_BYTES], uint32_t src_word) const { std::vector scrambled_data(buf, buf + GetPhysWidthByte()); - return scramble_decrypt_data( - scrambled_data, GetPhysWidth(), 39, AddrIntToBytes(src_word, addr_width_), - addr_width_, GetScrambleNonce(), GetScrambleKey(), repeat_keystream_); + return scramble_decrypt_data(scrambled_data, GetPhysWidth(), 39, + AddrIntToBytes(src_word, addr_width_), + addr_width_, GetScrambleNonce(), + GetScrambleKey(), repeat_keystream_, false); } void ScrambledEcc32MemArea::ReadBuffer(std::vector &data, @@ -196,7 +197,8 @@ void ScrambledEcc32MemArea::ScrambleBuffer(uint8_t buf[SV_MEM_WIDTH_BYTES], // Scramble data with integrity scramble_buf = scramble_encrypt_data( scramble_buf, GetPhysWidth(), 39, AddrIntToBytes(dst_word, addr_width_), - addr_width_, GetScrambleNonce(), GetScrambleKey(), repeat_keystream_); + addr_width_, GetScrambleNonce(), GetScrambleKey(), repeat_keystream_, + false); // Copy scrambled data to write buffer std::copy(scramble_buf.begin(), scramble_buf.end(), &buf[0]); diff --git a/hw/ip/otbn/rtl/otbn.sv b/hw/ip/otbn/rtl/otbn.sv index 03c0b01cc13fe0..d35f08c9aa6b18 100644 --- a/hw/ip/otbn/rtl/otbn.sv +++ b/hw/ip/otbn/rtl/otbn.sv @@ -332,8 +332,7 @@ module otbn .Width (39), .Depth (ImemSizeWords), .DataBitsPerMask(39), - .EnableParity (0), - .DiffWidth (39) + .EnableParity (0) ) u_imem ( .clk_i, .rst_ni(rst_n), @@ -535,7 +534,6 @@ module otbn .Depth (DmemSizeWords), .DataBitsPerMask (39), .EnableParity (0), - .DiffWidth (39), .ReplicateKeyStream(1) ) u_dmem ( .clk_i, diff --git a/hw/ip/prim/dv/prim_ram_scr/cpp/scramble_model.cc b/hw/ip/prim/dv/prim_ram_scr/cpp/scramble_model.cc index e8f7d7af8b053c..223703bf1f2ea4 100644 --- a/hw/ip/prim/dv/prim_ram_scr/cpp/scramble_model.cc +++ b/hw/ip/prim/dv/prim_ram_scr/cpp/scramble_model.cc @@ -322,7 +322,7 @@ std::vector scramble_encrypt_data( const std::vector &data_in, uint32_t data_width, uint32_t subst_perm_width, const std::vector &addr, uint32_t addr_width, const std::vector &nonce, - const std::vector &key, bool repeat_keystream) { + const std::vector &key, bool repeat_keystream, bool use_sp_layer) { assert(data_in.size() == ((data_width + 7) / 8)); assert(addr.size() == ((addr_width + 7) / 8)); @@ -335,22 +335,30 @@ std::vector scramble_encrypt_data( auto data_enc = xor_vectors(data_in, keystream); - return scramble_subst_perm_full_width(data_enc, data_width, subst_perm_width, - true); + if (use_sp_layer) { + return scramble_subst_perm_full_width(data_enc, data_width, + subst_perm_width, true); + } else { + return data_enc; + } } std::vector scramble_decrypt_data( const std::vector &data_in, uint32_t data_width, uint32_t subst_perm_width, const std::vector &addr, uint32_t addr_width, const std::vector &nonce, - const std::vector &key, bool repeat_keystream) { + const std::vector &key, bool repeat_keystream, bool use_sp_layer) { assert(data_in.size() == ((data_width + 7) / 8)); assert(addr.size() == ((addr_width + 7) / 8)); - // Data is decrypted by reversing substitution/permutation layer then XORing - // with keystream - auto data_sp_out = scramble_subst_perm_full_width(data_in, data_width, - subst_perm_width, false); + if (use_sp_layer) { + // Data is decrypted by reversing substitution/permutation layer then XORing + // with keystream + auto data_sp_out = scramble_subst_perm_full_width(data_in, data_width, + subst_perm_width, false); + } else { + auto data_sp_out = data_in; + } auto keystream = scramble_gen_keystream(addr, addr_width, nonce, key, data_width, diff --git a/hw/ip/prim/rtl/prim_ram_1p_scr.sv b/hw/ip/prim/rtl/prim_ram_1p_scr.sv index 30655819c6d7ac..d69dfd68d925b8 100644 --- a/hw/ip/prim/rtl/prim_ram_1p_scr.sv +++ b/hw/ip/prim/rtl/prim_ram_1p_scr.sv @@ -34,8 +34,12 @@ module prim_ram_1p_scr import prim_ram_1p_pkg::*; #( // to 2*5 + 1 effective rounds. Setting this to 2 halves this to approximately 5 effective rounds. // Number of PRINCE half rounds, can be [1..5] parameter int NumPrinceRoundsHalf = 2, - // Number of extra diffusion rounds. Setting this to 0 to disable diffusion. - parameter int NumDiffRounds = 2, + // Number of extra diffusion rounds. Setting this to 0 to disables diffusion. + // NOTE: this is disabled by default, since the non-linear transformation of data bits can + // interact adversely with end-to-end ECC integrity. Only enable this if you know what you are + // doing (e.g. using this primitive in a different context with byte parity). + // See also #20788 for more context. + parameter int NumDiffRounds = 0, // This parameter governs the block-width of additional diffusion layers. // For intra-byte diffusion, set this parameter to 8. parameter int DiffWidth = DataBitsPerMask, diff --git a/hw/ip/rom_ctrl/doc/rom_ctrl_blockdiag.svg b/hw/ip/rom_ctrl/doc/rom_ctrl_blockdiag.svg index 969c1f57c57822..f4069c8d40ec1c 100644 --- a/hw/ip/rom_ctrl/doc/rom_ctrl_blockdiag.svg +++ b/hw/ip/rom_ctrl/doc/rom_ctrl_blockdiag.svg @@ -1,12 +1,5 @@ + height="779.99994" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns="http://www.w3.org/2000/svg" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:dc="http://purl.org/dc/elements/1.1/"> @@ -716,13 +716,13 @@ guidetolerance="10" inkscape:pageopacity="0" inkscape:pageshadow="2" - inkscape:window-width="1920" - inkscape:window-height="1043" + inkscape:window-width="3440" + inkscape:window-height="1376" id="namedview2851" showgrid="false" inkscape:zoom="1.7927814" - inkscape:cx="1069.3263" - inkscape:cy="296.72112" + inkscape:cx="1069.5671" + inkscape:cy="296.46671" inkscape:window-x="0" inkscape:window-y="0" inkscape:window-maximized="1" @@ -733,7 +733,8 @@ fit-margin-top="0" fit-margin-left="0" fit-margin-right="0" - fit-margin-bottom="0"> + fit-margin-bottom="0" + inkscape:pagecheckerboard="0"> S&P - - - S&P - (trunc) - \ No newline at end of file + \ No newline at end of file diff --git a/hw/ip/sram_ctrl/doc/theory_of_operation.md b/hw/ip/sram_ctrl/doc/theory_of_operation.md index f3de13bd8554cc..206d8345554294 100644 --- a/hw/ip/sram_ctrl/doc/theory_of_operation.md +++ b/hw/ip/sram_ctrl/doc/theory_of_operation.md @@ -26,11 +26,8 @@ The individual mechanisms are explained in more detail in the subsections below. ## Scrambling Primitive As explained in [`prim_ram_1p_scr`](../../prim/doc/prim_ram_1p_scr.md) the scrambling mechanism employs a reduced-round PRINCE block cipher in CTR mode to scramble the data. -Since plain CTR mode does not diffuse the data bits due to the bitwise XOR, the scheme is augmented by passing each word through a shallow substitution-permutation (S&P) network implemented with the `prim_subst_perm` primitive. -The S&P network employed is similar to the one employed in PRESENT and is explained in more detail [here](../../prim/doc/prim_ram_1p_scr.md#custom-substitution-permutation-network). -Another CTR mode augmentation that is aimed at breaking the linear address space is SRAM address scrambling. -The same S&P network construction that is used for intra-word diffusion is leveraged to non-linearly remap the SRAM address as shown in the block diagram above. +In order to break the linear address space, the CTR mode is augmented with an S&P network to non-linearly remap the SRAM address as shown in the block diagram above. The S&P network employed is similar to the one employed in PRESENT and is explained in more detail [here](../../prim/doc/prim_ram_1p_scr.md#custom-substitution-permutation-network). ### Integrity Error Handling diff --git a/hw/ip/sram_ctrl/rtl/sram_ctrl.sv b/hw/ip/sram_ctrl/rtl/sram_ctrl.sv index e2185e0214b19c..7a1035e5dd0437 100644 --- a/hw/ip/sram_ctrl/rtl/sram_ctrl.sv +++ b/hw/ip/sram_ctrl/rtl/sram_ctrl.sv @@ -492,8 +492,7 @@ module sram_ctrl .Width(DataWidth), .Depth(Depth), .EnableParity(0), - .DataBitsPerMask(DataWidth), - .DiffWidth(DataWidth) + .DataBitsPerMask(DataWidth) ) u_prim_ram_1p_scr ( .clk_i, .rst_ni,