From b069018e2fe1fc613f5eeecc810050210e845528 Mon Sep 17 00:00:00 2001
From: Thomas Huth
Date: Mon, 4 Nov 2024 17:35:04 +0100
Subject: [PATCH 01/12] hw/char/sifive_uart: Fix broken UART on big endian hosts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Casting a "uint32_t *" to a "uint8_t *" to get to the lowest 8-bit part
of the value does not work on big endian hosts. We've got to take the
proper detour through an 8-bit variable.

Fixes: 53c1557b23 ("hw/char: sifive_uart: Print uart characters async")
Signed-off-by: Thomas Huth
Reviewed-by: Alistair Francis
Reviewed-by: Peter Maydell
Reviewed-by: Philippe Mathieu-Daudé
Message-ID: <20241104163504.305955-1-thuth@redhat.com>
Signed-off-by: Alistair Francis
---
 hw/char/sifive_uart.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hw/char/sifive_uart.c b/hw/char/sifive_uart.c
index aeb45d3601..5ae2a29ed6 100644
--- a/hw/char/sifive_uart.c
+++ b/hw/char/sifive_uart.c
@@ -174,10 +174,11 @@ sifive_uart_write(void *opaque, hwaddr addr,
 {
     SiFiveUARTState *s = opaque;
     uint32_t value = val64;
+    uint8_t ch = value;

     switch (addr) {
     case SIFIVE_UART_TXFIFO:
-        sifive_uart_write_tx_fifo(s, (uint8_t *) &value, 1);
+        sifive_uart_write_tx_fifo(s, &ch, 1);
         return;
     case SIFIVE_UART_IE:
         s->ie = val64;

From cd5d265f42fbb1d29cbc9d8805821149101c1d23 Mon Sep 17 00:00:00 2001
From: Daniel Henrique Barboza
Date: Mon, 4 Nov 2024 09:38:38 -0300
Subject: [PATCH 02/12] hw/riscv/riscv-iommu: change 'depth' to int

Coverity reports an unsigned overflow inside riscv_iommu_ctx_fetch() when
depth = 0 and we do:

    for (; depth-- > 0; ) {

Building it with a recent GCC, the code doesn't actually break with
depth = 0, i.e. the comparison "0-- > 0" will exit the loop instead of
proceeding, but 'depth' will retain the overflowed value afterwards. This
behavior can be compiler dependent, so change 'depth' to int to remove
this potential ambiguity.

Resolves: Coverity CID 1564783
Fixes: 0c54acb8243 ("hw/riscv: add RISC-V IOMMU base emulation")
Signed-off-by: Daniel Henrique Barboza
Reviewed-by: Alistair Francis
Message-ID: <20241104123839.533442-2-dbarboza@ventanamicro.com>
Signed-off-by: Alistair Francis
---
 hw/riscv/riscv-iommu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/riscv/riscv-iommu.c b/hw/riscv/riscv-iommu.c
index 12f01a75f5..164a7160fd 100644
--- a/hw/riscv/riscv-iommu.c
+++ b/hw/riscv/riscv-iommu.c
@@ -863,7 +863,7 @@ static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
     /* Device Context format: 0: extended (64 bytes) | 1: base (32 bytes) */
     const int dc_fmt = !s->enable_msi;
     const size_t dc_len = sizeof(dc) >> dc_fmt;
-    unsigned depth;
+    int depth;
     uint64_t de;

     switch (mode) {

From d3b96a53190dc52d436c39b03fb7533fef044869 Mon Sep 17 00:00:00 2001
From: Daniel Henrique Barboza
Date: Mon, 4 Nov 2024 09:38:39 -0300
Subject: [PATCH 03/12] hw/riscv/riscv-iommu: fix riscv_iommu_validate_process_ctx() check

'mode' will never be RISCV_IOMMU_CAP_SV32: the 'switch' right before this
check already errors out if 'mode' isn't 0, 8, 9 or 10. 'mode' should be
checked against RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV32 instead.

Reported by Coverity via a "DEADCODE" ticket.
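As a minimal standalone sketch of the dead code (the names and values
below are made up for the illustration and are not the QEMU definitions):

    /* the preceding switch only lets 'mode' through as 0, 8, 9 or 10 */
    #define SV32_CAP_BIT (1u << 17)   /* a capability flag, not a mode encoding */

    if (mode == SV32_CAP_BIT && !(cap & SV32_CAP_BIT)) {
        return false;                 /* never reached: mode is never (1u << 17) */
    }

Comparing 'mode' against the SV32 mode encoding instead makes the branch
reachable.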
Resolves: Coverity CID 1564781 Fixes: 0c54acb8243 ("hw/riscv: add RISC-V IOMMU base emulation") Signed-off-by: Daniel Henrique Barboza Reviewed-by: Alistair Francis Message-ID: <20241104123839.533442-3-dbarboza@ventanamicro.com> Signed-off-by: Alistair Francis --- hw/riscv/riscv-iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/riscv/riscv-iommu.c b/hw/riscv/riscv-iommu.c index 164a7160fd..bbc95425b3 100644 --- a/hw/riscv/riscv-iommu.c +++ b/hw/riscv/riscv-iommu.c @@ -820,7 +820,7 @@ static bool riscv_iommu_validate_process_ctx(RISCVIOMMUState *s, } if (ctx->tc & RISCV_IOMMU_DC_TC_SXL) { - if (mode == RISCV_IOMMU_CAP_SV32 && + if (mode == RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV32 && !(s->cap & RISCV_IOMMU_CAP_SV32)) { return false; } From b48381b1ee55053dfad6f5c10ca277bef29ee7c5 Mon Sep 17 00:00:00 2001 From: Max Chou Date: Thu, 19 Sep 2024 01:14:06 +0800 Subject: [PATCH 04/12] target/riscv: Set vdata.vm field for vector load/store whole register instructions The vm field of the vector load/store whole register instruction's encoding is 1. The helper function of the vector load/store whole register instructions may need the vdata.vm field to do some optimizations. Signed-off-by: Max Chou Reviewed-by: Daniel Henrique Barboza Message-ID: <20240918171412.150107-2-max.chou@sifive.com> Signed-off-by: Alistair Francis --- target/riscv/insn_trans/trans_rvv.c.inc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/target/riscv/insn_trans/trans_rvv.c.inc b/target/riscv/insn_trans/trans_rvv.c.inc index f8928c44a8..b9883a5d32 100644 --- a/target/riscv/insn_trans/trans_rvv.c.inc +++ b/target/riscv/insn_trans/trans_rvv.c.inc @@ -770,6 +770,7 @@ static bool ld_us_mask_op(DisasContext *s, arg_vlm_v *a, uint8_t eew) /* Mask destination register are always tail-agnostic */ data = FIELD_DP32(data, VDATA, VTA, s->cfg_vta_all_1s); data = FIELD_DP32(data, VDATA, VMA, s->vma); + data = FIELD_DP32(data, VDATA, VM, 1); return ldst_us_trans(a->rd, a->rs1, data, fn, s, false); } @@ -787,6 +788,7 @@ static bool st_us_mask_op(DisasContext *s, arg_vsm_v *a, uint8_t eew) /* EMUL = 1, NFIELDS = 1 */ data = FIELD_DP32(data, VDATA, LMUL, 0); data = FIELD_DP32(data, VDATA, NF, 1); + data = FIELD_DP32(data, VDATA, VM, 1); return ldst_us_trans(a->rd, a->rs1, data, fn, s, true); } @@ -1106,6 +1108,7 @@ static bool ldst_whole_trans(uint32_t vd, uint32_t rs1, uint32_t nf, TCGv_i32 desc; uint32_t data = FIELD_DP32(0, VDATA, NF, nf); + data = FIELD_DP32(data, VDATA, VM, 1); dest = tcg_temp_new_ptr(); desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlenb, s->cfg_ptr->vlenb, data)); From 2f077842f2b356a086f70e54c831be53c6f0e101 Mon Sep 17 00:00:00 2001 From: Max Chou Date: Thu, 19 Sep 2024 01:14:07 +0800 Subject: [PATCH 05/12] target/riscv: rvv: Replace VSTART_CHECK_EARLY_EXIT in vext_ldst_us Because the real vl (evl) of vext_ldst_us may be different (e.g. vlm.v/vsm.v/etc.), so the VSTART_CHECK_EARLY_EXIT checking function should be replaced by checking evl in vext_ldst_us. 
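A worked example (the vl value is picked purely for illustration): vlm.v
with vl = 20 runs the helper with evl = (20 + 7) >> 3 = 3 bytes, so the
early exit has to compare vstart against 3, not against the original vl
of 20.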
Signed-off-by: Max Chou Reviewed-by: Daniel Henrique Barboza Message-ID: <20240918171412.150107-3-max.chou@sifive.com> Signed-off-by: Alistair Francis --- target/riscv/vector_helper.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c index ccb32e6122..93cac23a13 100644 --- a/target/riscv/vector_helper.c +++ b/target/riscv/vector_helper.c @@ -277,7 +277,10 @@ vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc, uint32_t max_elems = vext_max_elems(desc, log2_esz); uint32_t esz = 1 << log2_esz; - VSTART_CHECK_EARLY_EXIT(env); + if (env->vstart >= evl) { + env->vstart = 0; + return; + } /* load bytes from guest memory */ for (i = env->vstart; i < evl; env->vstart = ++i) { From 338aa15d50b37fa797677d96c091aa81a383e2a1 Mon Sep 17 00:00:00 2001 From: Max Chou Date: Thu, 19 Sep 2024 01:14:08 +0800 Subject: [PATCH 06/12] target/riscv: rvv: Provide a fast path using direct access to host ram for unmasked unit-stride load/store This commit references the sve_ldN_r/sve_stN_r helper functions in ARM target to optimize the vector unmasked unit-stride load/store implementation with following optimizations: * Get the page boundary * Probing pages/resolving host memory address at the beginning if possible * Provide new interface to direct access host memory * Switch to the original slow TLB access when cross page element/violate page permission/violate pmp/watchpoints in page The original element load/store interface is replaced by the new element load/store functions with _tlb & _host postfix that means doing the element load/store through the original softmmu flow and the direct access host memory flow. Signed-off-by: Max Chou Reviewed-by: Daniel Henrique Barboza Message-ID: <20240918171412.150107-4-max.chou@sifive.com> Signed-off-by: Alistair Francis --- target/riscv/vector_helper.c | 363 +++++++++++++++++++++-------------- 1 file changed, 224 insertions(+), 139 deletions(-) diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c index 93cac23a13..304e9951ed 100644 --- a/target/riscv/vector_helper.c +++ b/target/riscv/vector_helper.c @@ -148,34 +148,47 @@ static inline void vext_set_elem_mask(void *v0, int index, } /* elements operations for load and store */ -typedef void vext_ldst_elem_fn(CPURISCVState *env, abi_ptr addr, - uint32_t idx, void *vd, uintptr_t retaddr); +typedef void vext_ldst_elem_fn_tlb(CPURISCVState *env, abi_ptr addr, + uint32_t idx, void *vd, uintptr_t retaddr); +typedef void vext_ldst_elem_fn_host(void *vd, uint32_t idx, void *host); -#define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF) \ -static void NAME(CPURISCVState *env, abi_ptr addr, \ - uint32_t idx, void *vd, uintptr_t retaddr)\ -{ \ - ETYPE *cur = ((ETYPE *)vd + H(idx)); \ - *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr); \ -} \ - -GEN_VEXT_LD_ELEM(lde_b, int8_t, H1, ldsb) -GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw) -GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl) -GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq) - -#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF) \ -static void NAME(CPURISCVState *env, abi_ptr addr, \ - uint32_t idx, void *vd, uintptr_t retaddr)\ -{ \ - ETYPE data = *((ETYPE *)vd + H(idx)); \ - cpu_##STSUF##_data_ra(env, addr, data, retaddr); \ +#define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF) \ +static void NAME##_tlb(CPURISCVState *env, abi_ptr addr, \ + uint32_t idx, void *vd, uintptr_t retaddr) \ +{ \ + ETYPE *cur = ((ETYPE *)vd + H(idx)); \ + *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr); \ +} \ 
+ \ +static void NAME##_host(void *vd, uint32_t idx, void *host) \ +{ \ + ETYPE *cur = ((ETYPE *)vd + H(idx)); \ + *cur = (ETYPE)LDSUF##_p(host); \ +} + +GEN_VEXT_LD_ELEM(lde_b, uint8_t, H1, ldub) +GEN_VEXT_LD_ELEM(lde_h, uint16_t, H2, lduw) +GEN_VEXT_LD_ELEM(lde_w, uint32_t, H4, ldl) +GEN_VEXT_LD_ELEM(lde_d, uint64_t, H8, ldq) + +#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF) \ +static void NAME##_tlb(CPURISCVState *env, abi_ptr addr, \ + uint32_t idx, void *vd, uintptr_t retaddr) \ +{ \ + ETYPE data = *((ETYPE *)vd + H(idx)); \ + cpu_##STSUF##_data_ra(env, addr, data, retaddr); \ +} \ + \ +static void NAME##_host(void *vd, uint32_t idx, void *host) \ +{ \ + ETYPE data = *((ETYPE *)vd + H(idx)); \ + STSUF##_p(host, data); \ } -GEN_VEXT_ST_ELEM(ste_b, int8_t, H1, stb) -GEN_VEXT_ST_ELEM(ste_h, int16_t, H2, stw) -GEN_VEXT_ST_ELEM(ste_w, int32_t, H4, stl) -GEN_VEXT_ST_ELEM(ste_d, int64_t, H8, stq) +GEN_VEXT_ST_ELEM(ste_b, uint8_t, H1, stb) +GEN_VEXT_ST_ELEM(ste_h, uint16_t, H2, stw) +GEN_VEXT_ST_ELEM(ste_w, uint32_t, H4, stl) +GEN_VEXT_ST_ELEM(ste_d, uint64_t, H8, stq) static void vext_set_tail_elems_1s(target_ulong vl, void *vd, uint32_t desc, uint32_t nf, @@ -198,11 +211,10 @@ static void vext_set_tail_elems_1s(target_ulong vl, void *vd, * stride: access vector element from strided memory */ static void -vext_ldst_stride(void *vd, void *v0, target_ulong base, - target_ulong stride, CPURISCVState *env, - uint32_t desc, uint32_t vm, - vext_ldst_elem_fn *ldst_elem, - uint32_t log2_esz, uintptr_t ra) +vext_ldst_stride(void *vd, void *v0, target_ulong base, target_ulong stride, + CPURISCVState *env, uint32_t desc, uint32_t vm, + vext_ldst_elem_fn_tlb *ldst_elem, uint32_t log2_esz, + uintptr_t ra) { uint32_t i, k; uint32_t nf = vext_nf(desc); @@ -242,10 +254,10 @@ void HELPER(NAME)(void *vd, void * v0, target_ulong base, \ ctzl(sizeof(ETYPE)), GETPC()); \ } -GEN_VEXT_LD_STRIDE(vlse8_v, int8_t, lde_b) -GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h) -GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w) -GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d) +GEN_VEXT_LD_STRIDE(vlse8_v, int8_t, lde_b_tlb) +GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h_tlb) +GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w_tlb) +GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d_tlb) #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN) \ void HELPER(NAME)(void *vd, void *v0, target_ulong base, \ @@ -257,42 +269,114 @@ void HELPER(NAME)(void *vd, void *v0, target_ulong base, \ ctzl(sizeof(ETYPE)), GETPC()); \ } -GEN_VEXT_ST_STRIDE(vsse8_v, int8_t, ste_b) -GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h) -GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w) -GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d) +GEN_VEXT_ST_STRIDE(vsse8_v, int8_t, ste_b_tlb) +GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h_tlb) +GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w_tlb) +GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d_tlb) /* * unit-stride: access elements stored contiguously in memory */ /* unmasked unit-stride load and store operation */ +static void +vext_page_ldst_us(CPURISCVState *env, void *vd, target_ulong addr, + uint32_t elems, uint32_t nf, uint32_t max_elems, + uint32_t log2_esz, bool is_load, int mmu_index, + vext_ldst_elem_fn_tlb *ldst_tlb, + vext_ldst_elem_fn_host *ldst_host, uintptr_t ra) +{ + void *host; + int i, k, flags; + uint32_t esz = 1 << log2_esz; + uint32_t size = (elems * nf) << log2_esz; + uint32_t evl = env->vstart + elems; + MMUAccessType access_type = is_load ? MMU_DATA_LOAD : MMU_DATA_STORE; + + /* Check page permission/pmp/watchpoint/etc. 
*/ + flags = probe_access_flags(env, adjust_addr(env, addr), size, access_type, + mmu_index, true, &host, ra); + + if (flags == 0) { + for (i = env->vstart; i < evl; ++i) { + k = 0; + while (k < nf) { + ldst_host(vd, i + k * max_elems, host); + host += esz; + k++; + } + } + env->vstart += elems; + } else { + /* load bytes from guest memory */ + for (i = env->vstart; i < evl; env->vstart = ++i) { + k = 0; + while (k < nf) { + ldst_tlb(env, adjust_addr(env, addr), i + k * max_elems, vd, + ra); + addr += esz; + k++; + } + } + } +} + static void vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc, - vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl, - uintptr_t ra) + vext_ldst_elem_fn_tlb *ldst_tlb, + vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz, + uint32_t evl, uintptr_t ra, bool is_load) { - uint32_t i, k; + uint32_t k; + target_ulong page_split, elems, addr; uint32_t nf = vext_nf(desc); uint32_t max_elems = vext_max_elems(desc, log2_esz); uint32_t esz = 1 << log2_esz; + uint32_t msize = nf * esz; + int mmu_index = riscv_env_mmu_index(env, false); if (env->vstart >= evl) { env->vstart = 0; return; } - /* load bytes from guest memory */ - for (i = env->vstart; i < evl; env->vstart = ++i) { - k = 0; - while (k < nf) { - target_ulong addr = base + ((i * nf + k) << log2_esz); - ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra); - k++; + /* Calculate the page range of first page */ + addr = base + ((env->vstart * nf) << log2_esz); + page_split = -(addr | TARGET_PAGE_MASK); + /* Get number of elements */ + elems = page_split / msize; + if (unlikely(env->vstart + elems >= evl)) { + elems = evl - env->vstart; + } + + /* Load/store elements in the first page */ + if (likely(elems)) { + vext_page_ldst_us(env, vd, addr, elems, nf, max_elems, log2_esz, + is_load, mmu_index, ldst_tlb, ldst_host, ra); + } + + /* Load/store elements in the second page */ + if (unlikely(env->vstart < evl)) { + /* Cross page element */ + if (unlikely(page_split % msize)) { + for (k = 0; k < nf; k++) { + addr = base + ((env->vstart * nf + k) << log2_esz); + ldst_tlb(env, adjust_addr(env, addr), + env->vstart + k * max_elems, vd, ra); + } + env->vstart++; } + + addr = base + ((env->vstart * nf) << log2_esz); + /* Get number of elements of second page */ + elems = evl - env->vstart; + + /* Load/store elements in the second page */ + vext_page_ldst_us(env, vd, addr, elems, nf, max_elems, log2_esz, + is_load, mmu_index, ldst_tlb, ldst_host, ra); } - env->vstart = 0; + env->vstart = 0; vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems); } @@ -301,47 +385,47 @@ vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc, * stride, stride = NF * sizeof (ETYPE) */ -#define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN) \ -void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base, \ - CPURISCVState *env, uint32_t desc) \ -{ \ - uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE)); \ - vext_ldst_stride(vd, v0, base, stride, env, desc, false, LOAD_FN, \ - ctzl(sizeof(ETYPE)), GETPC()); \ -} \ - \ -void HELPER(NAME)(void *vd, void *v0, target_ulong base, \ - CPURISCVState *env, uint32_t desc) \ -{ \ - vext_ldst_us(vd, base, env, desc, LOAD_FN, \ - ctzl(sizeof(ETYPE)), env->vl, GETPC()); \ +#define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST) \ +void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base, \ + CPURISCVState *env, uint32_t desc) \ +{ \ + uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE)); \ + vext_ldst_stride(vd, v0, base, 
stride, env, desc, false, \ + LOAD_FN_TLB, ctzl(sizeof(ETYPE)), GETPC()); \ +} \ + \ +void HELPER(NAME)(void *vd, void *v0, target_ulong base, \ + CPURISCVState *env, uint32_t desc) \ +{ \ + vext_ldst_us(vd, base, env, desc, LOAD_FN_TLB, LOAD_FN_HOST, \ + ctzl(sizeof(ETYPE)), env->vl, GETPC(), true); \ } -GEN_VEXT_LD_US(vle8_v, int8_t, lde_b) -GEN_VEXT_LD_US(vle16_v, int16_t, lde_h) -GEN_VEXT_LD_US(vle32_v, int32_t, lde_w) -GEN_VEXT_LD_US(vle64_v, int64_t, lde_d) +GEN_VEXT_LD_US(vle8_v, int8_t, lde_b_tlb, lde_b_host) +GEN_VEXT_LD_US(vle16_v, int16_t, lde_h_tlb, lde_h_host) +GEN_VEXT_LD_US(vle32_v, int32_t, lde_w_tlb, lde_w_host) +GEN_VEXT_LD_US(vle64_v, int64_t, lde_d_tlb, lde_d_host) -#define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN) \ +#define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN_TLB, STORE_FN_HOST) \ void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base, \ CPURISCVState *env, uint32_t desc) \ { \ uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE)); \ - vext_ldst_stride(vd, v0, base, stride, env, desc, false, STORE_FN, \ - ctzl(sizeof(ETYPE)), GETPC()); \ + vext_ldst_stride(vd, v0, base, stride, env, desc, false, \ + STORE_FN_TLB, ctzl(sizeof(ETYPE)), GETPC()); \ } \ \ void HELPER(NAME)(void *vd, void *v0, target_ulong base, \ CPURISCVState *env, uint32_t desc) \ { \ - vext_ldst_us(vd, base, env, desc, STORE_FN, \ - ctzl(sizeof(ETYPE)), env->vl, GETPC()); \ + vext_ldst_us(vd, base, env, desc, STORE_FN_TLB, STORE_FN_HOST, \ + ctzl(sizeof(ETYPE)), env->vl, GETPC(), false); \ } -GEN_VEXT_ST_US(vse8_v, int8_t, ste_b) -GEN_VEXT_ST_US(vse16_v, int16_t, ste_h) -GEN_VEXT_ST_US(vse32_v, int32_t, ste_w) -GEN_VEXT_ST_US(vse64_v, int64_t, ste_d) +GEN_VEXT_ST_US(vse8_v, int8_t, ste_b_tlb, ste_b_host) +GEN_VEXT_ST_US(vse16_v, int16_t, ste_h_tlb, ste_h_host) +GEN_VEXT_ST_US(vse32_v, int32_t, ste_w_tlb, ste_w_host) +GEN_VEXT_ST_US(vse64_v, int64_t, ste_d_tlb, ste_d_host) /* * unit stride mask load and store, EEW = 1 @@ -351,8 +435,8 @@ void HELPER(vlm_v)(void *vd, void *v0, target_ulong base, { /* evl = ceil(vl/8) */ uint8_t evl = (env->vl + 7) >> 3; - vext_ldst_us(vd, base, env, desc, lde_b, - 0, evl, GETPC()); + vext_ldst_us(vd, base, env, desc, lde_b_tlb, lde_b_host, + 0, evl, GETPC(), true); } void HELPER(vsm_v)(void *vd, void *v0, target_ulong base, @@ -360,8 +444,8 @@ void HELPER(vsm_v)(void *vd, void *v0, target_ulong base, { /* evl = ceil(vl/8) */ uint8_t evl = (env->vl + 7) >> 3; - vext_ldst_us(vd, base, env, desc, ste_b, - 0, evl, GETPC()); + vext_ldst_us(vd, base, env, desc, ste_b_tlb, ste_b_host, + 0, evl, GETPC(), false); } /* @@ -386,7 +470,7 @@ static inline void vext_ldst_index(void *vd, void *v0, target_ulong base, void *vs2, CPURISCVState *env, uint32_t desc, vext_get_index_addr get_index_addr, - vext_ldst_elem_fn *ldst_elem, + vext_ldst_elem_fn_tlb *ldst_elem, uint32_t log2_esz, uintptr_t ra) { uint32_t i, k; @@ -427,22 +511,22 @@ void HELPER(NAME)(void *vd, void *v0, target_ulong base, \ LOAD_FN, ctzl(sizeof(ETYPE)), GETPC()); \ } -GEN_VEXT_LD_INDEX(vlxei8_8_v, int8_t, idx_b, lde_b) -GEN_VEXT_LD_INDEX(vlxei8_16_v, int16_t, idx_b, lde_h) -GEN_VEXT_LD_INDEX(vlxei8_32_v, int32_t, idx_b, lde_w) -GEN_VEXT_LD_INDEX(vlxei8_64_v, int64_t, idx_b, lde_d) -GEN_VEXT_LD_INDEX(vlxei16_8_v, int8_t, idx_h, lde_b) -GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h) -GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w) -GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d) -GEN_VEXT_LD_INDEX(vlxei32_8_v, int8_t, idx_w, lde_b) -GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h) 
-GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w) -GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d) -GEN_VEXT_LD_INDEX(vlxei64_8_v, int8_t, idx_d, lde_b) -GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h) -GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w) -GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d) +GEN_VEXT_LD_INDEX(vlxei8_8_v, int8_t, idx_b, lde_b_tlb) +GEN_VEXT_LD_INDEX(vlxei8_16_v, int16_t, idx_b, lde_h_tlb) +GEN_VEXT_LD_INDEX(vlxei8_32_v, int32_t, idx_b, lde_w_tlb) +GEN_VEXT_LD_INDEX(vlxei8_64_v, int64_t, idx_b, lde_d_tlb) +GEN_VEXT_LD_INDEX(vlxei16_8_v, int8_t, idx_h, lde_b_tlb) +GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h_tlb) +GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w_tlb) +GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d_tlb) +GEN_VEXT_LD_INDEX(vlxei32_8_v, int8_t, idx_w, lde_b_tlb) +GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h_tlb) +GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w_tlb) +GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d_tlb) +GEN_VEXT_LD_INDEX(vlxei64_8_v, int8_t, idx_d, lde_b_tlb) +GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h_tlb) +GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w_tlb) +GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d_tlb) #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN) \ void HELPER(NAME)(void *vd, void *v0, target_ulong base, \ @@ -453,22 +537,22 @@ void HELPER(NAME)(void *vd, void *v0, target_ulong base, \ GETPC()); \ } -GEN_VEXT_ST_INDEX(vsxei8_8_v, int8_t, idx_b, ste_b) -GEN_VEXT_ST_INDEX(vsxei8_16_v, int16_t, idx_b, ste_h) -GEN_VEXT_ST_INDEX(vsxei8_32_v, int32_t, idx_b, ste_w) -GEN_VEXT_ST_INDEX(vsxei8_64_v, int64_t, idx_b, ste_d) -GEN_VEXT_ST_INDEX(vsxei16_8_v, int8_t, idx_h, ste_b) -GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h) -GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w) -GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d) -GEN_VEXT_ST_INDEX(vsxei32_8_v, int8_t, idx_w, ste_b) -GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h) -GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w) -GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d) -GEN_VEXT_ST_INDEX(vsxei64_8_v, int8_t, idx_d, ste_b) -GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h) -GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w) -GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d) +GEN_VEXT_ST_INDEX(vsxei8_8_v, int8_t, idx_b, ste_b_tlb) +GEN_VEXT_ST_INDEX(vsxei8_16_v, int16_t, idx_b, ste_h_tlb) +GEN_VEXT_ST_INDEX(vsxei8_32_v, int32_t, idx_b, ste_w_tlb) +GEN_VEXT_ST_INDEX(vsxei8_64_v, int64_t, idx_b, ste_d_tlb) +GEN_VEXT_ST_INDEX(vsxei16_8_v, int8_t, idx_h, ste_b_tlb) +GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h_tlb) +GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w_tlb) +GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d_tlb) +GEN_VEXT_ST_INDEX(vsxei32_8_v, int8_t, idx_w, ste_b_tlb) +GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h_tlb) +GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w_tlb) +GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d_tlb) +GEN_VEXT_ST_INDEX(vsxei64_8_v, int8_t, idx_d, ste_b_tlb) +GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h_tlb) +GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w_tlb) +GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d_tlb) /* * unit-stride fault-only-fisrt load instructions @@ -476,7 +560,7 @@ GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d) static inline void vext_ldff(void *vd, void *v0, target_ulong base, CPURISCVState *env, uint32_t 
desc, - vext_ldst_elem_fn *ldst_elem, + vext_ldst_elem_fn_tlb *ldst_elem, uint32_t log2_esz, uintptr_t ra) { uint32_t i, k, vl = 0; @@ -562,10 +646,10 @@ void HELPER(NAME)(void *vd, void *v0, target_ulong base, \ ctzl(sizeof(ETYPE)), GETPC()); \ } -GEN_VEXT_LDFF(vle8ff_v, int8_t, lde_b) -GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h) -GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w) -GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d) +GEN_VEXT_LDFF(vle8ff_v, int8_t, lde_b_tlb) +GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h_tlb) +GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w_tlb) +GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d_tlb) #define DO_SWAP(N, M) (M) #define DO_AND(N, M) (N & M) @@ -582,7 +666,8 @@ GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d) */ static void vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc, - vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uintptr_t ra) + vext_ldst_elem_fn_tlb *ldst_elem, uint32_t log2_esz, + uintptr_t ra) { uint32_t i, k, off, pos; uint32_t nf = vext_nf(desc); @@ -626,22 +711,22 @@ void HELPER(NAME)(void *vd, target_ulong base, \ ctzl(sizeof(ETYPE)), GETPC()); \ } -GEN_VEXT_LD_WHOLE(vl1re8_v, int8_t, lde_b) -GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h) -GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w) -GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d) -GEN_VEXT_LD_WHOLE(vl2re8_v, int8_t, lde_b) -GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h) -GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w) -GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d) -GEN_VEXT_LD_WHOLE(vl4re8_v, int8_t, lde_b) -GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h) -GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w) -GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d) -GEN_VEXT_LD_WHOLE(vl8re8_v, int8_t, lde_b) -GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h) -GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w) -GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d) +GEN_VEXT_LD_WHOLE(vl1re8_v, int8_t, lde_b_tlb) +GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h_tlb) +GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w_tlb) +GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d_tlb) +GEN_VEXT_LD_WHOLE(vl2re8_v, int8_t, lde_b_tlb) +GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h_tlb) +GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w_tlb) +GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d_tlb) +GEN_VEXT_LD_WHOLE(vl4re8_v, int8_t, lde_b_tlb) +GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h_tlb) +GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w_tlb) +GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d_tlb) +GEN_VEXT_LD_WHOLE(vl8re8_v, int8_t, lde_b_tlb) +GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h_tlb) +GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w_tlb) +GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d_tlb) #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN) \ void HELPER(NAME)(void *vd, target_ulong base, \ @@ -651,10 +736,10 @@ void HELPER(NAME)(void *vd, target_ulong base, \ ctzl(sizeof(ETYPE)), GETPC()); \ } -GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b) -GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b) -GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b) -GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b) +GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b_tlb) +GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b_tlb) +GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b_tlb) +GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b_tlb) /* * Vector Integer Arithmetic Instructions From 3333000f693e31fd9c5bf3e50f21c90b8ca1b512 Mon Sep 17 00:00:00 2001 From: Max Chou Date: Thu, 19 Sep 2024 01:14:09 +0800 Subject: [PATCH 07/12] target/riscv: rvv: Provide a fast path using direct access to host ram for unit-stride whole register load/store The vector unit-stride whole register load/store instructions are 
similar to unmasked unit-stride load/store instructions that is suitable to be optimized by using a direct access to host ram fast path. Because the vector whole register load/store instructions do not need to handle the tail agnostic, so remove the vstart early exit checking. Signed-off-by: Max Chou Reviewed-by: Daniel Henrique Barboza Message-ID: <20240918171412.150107-5-max.chou@sifive.com> Signed-off-by: Alistair Francis --- target/riscv/vector_helper.c | 129 +++++++++++++++++++---------------- 1 file changed, 70 insertions(+), 59 deletions(-) diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c index 304e9951ed..ddcd1600e8 100644 --- a/target/riscv/vector_helper.c +++ b/target/riscv/vector_helper.c @@ -666,80 +666,91 @@ GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d_tlb) */ static void vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc, - vext_ldst_elem_fn_tlb *ldst_elem, uint32_t log2_esz, - uintptr_t ra) + vext_ldst_elem_fn_tlb *ldst_tlb, + vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz, + uintptr_t ra, bool is_load) { - uint32_t i, k, off, pos; + target_ulong page_split, elems, addr; uint32_t nf = vext_nf(desc); uint32_t vlenb = riscv_cpu_cfg(env)->vlenb; uint32_t max_elems = vlenb >> log2_esz; + uint32_t evl = nf * max_elems; + uint32_t esz = 1 << log2_esz; + int mmu_index = riscv_env_mmu_index(env, false); - if (env->vstart >= ((vlenb * nf) >> log2_esz)) { - env->vstart = 0; - return; + /* Calculate the page range of first page */ + addr = base + (env->vstart << log2_esz); + page_split = -(addr | TARGET_PAGE_MASK); + /* Get number of elements */ + elems = page_split / esz; + if (unlikely(env->vstart + elems >= evl)) { + elems = evl - env->vstart; } - k = env->vstart / max_elems; - off = env->vstart % max_elems; - - if (off) { - /* load/store rest of elements of current segment pointed by vstart */ - for (pos = off; pos < max_elems; pos++, env->vstart++) { - target_ulong addr = base + ((pos + k * max_elems) << log2_esz); - ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd, - ra); - } - k++; + /* Load/store elements in the first page */ + if (likely(elems)) { + vext_page_ldst_us(env, vd, addr, elems, 1, max_elems, log2_esz, + is_load, mmu_index, ldst_tlb, ldst_host, ra); } - /* load/store elements for rest of segments */ - for (; k < nf; k++) { - for (i = 0; i < max_elems; i++, env->vstart++) { - target_ulong addr = base + ((i + k * max_elems) << log2_esz); - ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra); + /* Load/store elements in the second page */ + if (unlikely(env->vstart < evl)) { + /* Cross page element */ + if (unlikely(page_split % esz)) { + addr = base + (env->vstart << log2_esz); + ldst_tlb(env, adjust_addr(env, addr), env->vstart, vd, ra); + env->vstart++; } + + addr = base + (env->vstart << log2_esz); + /* Get number of elements of second page */ + elems = evl - env->vstart; + + /* Load/store elements in the second page */ + vext_page_ldst_us(env, vd, addr, elems, 1, max_elems, log2_esz, + is_load, mmu_index, ldst_tlb, ldst_host, ra); } env->vstart = 0; } -#define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN) \ -void HELPER(NAME)(void *vd, target_ulong base, \ - CPURISCVState *env, uint32_t desc) \ -{ \ - vext_ldst_whole(vd, base, env, desc, LOAD_FN, \ - ctzl(sizeof(ETYPE)), GETPC()); \ -} - -GEN_VEXT_LD_WHOLE(vl1re8_v, int8_t, lde_b_tlb) -GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h_tlb) -GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w_tlb) -GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d_tlb) 
-GEN_VEXT_LD_WHOLE(vl2re8_v, int8_t, lde_b_tlb) -GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h_tlb) -GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w_tlb) -GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d_tlb) -GEN_VEXT_LD_WHOLE(vl4re8_v, int8_t, lde_b_tlb) -GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h_tlb) -GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w_tlb) -GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d_tlb) -GEN_VEXT_LD_WHOLE(vl8re8_v, int8_t, lde_b_tlb) -GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h_tlb) -GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w_tlb) -GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d_tlb) - -#define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN) \ -void HELPER(NAME)(void *vd, target_ulong base, \ - CPURISCVState *env, uint32_t desc) \ -{ \ - vext_ldst_whole(vd, base, env, desc, STORE_FN, \ - ctzl(sizeof(ETYPE)), GETPC()); \ -} - -GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b_tlb) -GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b_tlb) -GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b_tlb) -GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b_tlb) +#define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST) \ +void HELPER(NAME)(void *vd, target_ulong base, CPURISCVState *env, \ + uint32_t desc) \ +{ \ + vext_ldst_whole(vd, base, env, desc, LOAD_FN_TLB, LOAD_FN_HOST, \ + ctzl(sizeof(ETYPE)), GETPC(), true); \ +} + +GEN_VEXT_LD_WHOLE(vl1re8_v, int8_t, lde_b_tlb, lde_b_host) +GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h_tlb, lde_h_host) +GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w_tlb, lde_w_host) +GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d_tlb, lde_d_host) +GEN_VEXT_LD_WHOLE(vl2re8_v, int8_t, lde_b_tlb, lde_b_host) +GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h_tlb, lde_h_host) +GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w_tlb, lde_w_host) +GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d_tlb, lde_d_host) +GEN_VEXT_LD_WHOLE(vl4re8_v, int8_t, lde_b_tlb, lde_b_host) +GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h_tlb, lde_h_host) +GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w_tlb, lde_w_host) +GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d_tlb, lde_d_host) +GEN_VEXT_LD_WHOLE(vl8re8_v, int8_t, lde_b_tlb, lde_b_host) +GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h_tlb, lde_h_host) +GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w_tlb, lde_w_host) +GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d_tlb, lde_d_host) + +#define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN_TLB, STORE_FN_HOST) \ +void HELPER(NAME)(void *vd, target_ulong base, CPURISCVState *env, \ + uint32_t desc) \ +{ \ + vext_ldst_whole(vd, base, env, desc, STORE_FN_TLB, STORE_FN_HOST, \ + ctzl(sizeof(ETYPE)), GETPC(), false); \ +} + +GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b_tlb, ste_b_host) +GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b_tlb, ste_b_host) +GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b_tlb, ste_b_host) +GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b_tlb, ste_b_host) /* * Vector Integer Arithmetic Instructions From f00089267df8d6c9b8c8cc92aa0ba22737f6dfd2 Mon Sep 17 00:00:00 2001 From: Max Chou Date: Thu, 19 Sep 2024 01:14:10 +0800 Subject: [PATCH 08/12] target/riscv: rvv: Provide a fast path using direct access to host ram for unit-stride load-only-first load instructions The unmasked unit-stride fault-only-first load instructions are similar to the unmasked unit-stride load/store instructions that is suitable to be optimized by using a direct access to host ram fast path. 
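The resulting control flow, as a simplified sketch rather than the exact
helper code:

    if (env->vstart < env->vl) {
        if (vm) {
            /* unmasked: probe the first page and load its elements either
             * host-direct or via the TLB, load the single element that may
             * straddle the page boundary, then handle the second page */
        } else {
            /* masked: keep the original per-element loop so that
             * masked-off elements can still be set to 1s */
        }
    }

The fault-only-first probing that may shrink vl happens before this point
and is unchanged.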
Signed-off-by: Max Chou Reviewed-by: Daniel Henrique Barboza Message-ID: <20240918171412.150107-6-max.chou@sifive.com> Signed-off-by: Alistair Francis --- target/riscv/vector_helper.c | 98 ++++++++++++++++++++++++++---------- 1 file changed, 71 insertions(+), 27 deletions(-) diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c index ddcd1600e8..d8d476c3ea 100644 --- a/target/riscv/vector_helper.c +++ b/target/riscv/vector_helper.c @@ -558,18 +558,18 @@ GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d_tlb) * unit-stride fault-only-fisrt load instructions */ static inline void -vext_ldff(void *vd, void *v0, target_ulong base, - CPURISCVState *env, uint32_t desc, - vext_ldst_elem_fn_tlb *ldst_elem, - uint32_t log2_esz, uintptr_t ra) +vext_ldff(void *vd, void *v0, target_ulong base, CPURISCVState *env, + uint32_t desc, vext_ldst_elem_fn_tlb *ldst_tlb, + vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz, uintptr_t ra) { uint32_t i, k, vl = 0; uint32_t nf = vext_nf(desc); uint32_t vm = vext_vm(desc); uint32_t max_elems = vext_max_elems(desc, log2_esz); uint32_t esz = 1 << log2_esz; + uint32_t msize = nf * esz; uint32_t vma = vext_vma(desc); - target_ulong addr, offset, remain; + target_ulong addr, offset, remain, page_split, elems; int mmu_index = riscv_env_mmu_index(env, false); VSTART_CHECK_EARLY_EXIT(env); @@ -618,19 +618,63 @@ vext_ldff(void *vd, void *v0, target_ulong base, if (vl != 0) { env->vl = vl; } - for (i = env->vstart; i < env->vl; i++) { - k = 0; - while (k < nf) { - if (!vm && !vext_elem_mask(v0, i)) { - /* set masked-off elements to 1s */ - vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz, - (i + k * max_elems + 1) * esz); - k++; - continue; + + if (env->vstart < env->vl) { + if (vm) { + /* Calculate the page range of first page */ + addr = base + ((env->vstart * nf) << log2_esz); + page_split = -(addr | TARGET_PAGE_MASK); + /* Get number of elements */ + elems = page_split / msize; + if (unlikely(env->vstart + elems >= env->vl)) { + elems = env->vl - env->vstart; + } + + /* Load/store elements in the first page */ + if (likely(elems)) { + vext_page_ldst_us(env, vd, addr, elems, nf, max_elems, + log2_esz, true, mmu_index, ldst_tlb, + ldst_host, ra); + } + + /* Load/store elements in the second page */ + if (unlikely(env->vstart < env->vl)) { + /* Cross page element */ + if (unlikely(page_split % msize)) { + for (k = 0; k < nf; k++) { + addr = base + ((env->vstart * nf + k) << log2_esz); + ldst_tlb(env, adjust_addr(env, addr), + env->vstart + k * max_elems, vd, ra); + } + env->vstart++; + } + + addr = base + ((env->vstart * nf) << log2_esz); + /* Get number of elements of second page */ + elems = env->vl - env->vstart; + + /* Load/store elements in the second page */ + vext_page_ldst_us(env, vd, addr, elems, nf, max_elems, + log2_esz, true, mmu_index, ldst_tlb, + ldst_host, ra); + } + } else { + for (i = env->vstart; i < env->vl; i++) { + k = 0; + while (k < nf) { + if (!vext_elem_mask(v0, i)) { + /* set masked-off elements to 1s */ + vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz, + (i + k * max_elems + 1) * esz); + k++; + continue; + } + addr = base + ((i * nf + k) << log2_esz); + ldst_tlb(env, adjust_addr(env, addr), i + k * max_elems, + vd, ra); + k++; + } } - addr = base + ((i * nf + k) << log2_esz); - ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra); - k++; } } env->vstart = 0; @@ -638,18 +682,18 @@ vext_ldff(void *vd, void *v0, target_ulong base, vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems); } -#define 
GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN) \ -void HELPER(NAME)(void *vd, void *v0, target_ulong base, \ - CPURISCVState *env, uint32_t desc) \ -{ \ - vext_ldff(vd, v0, base, env, desc, LOAD_FN, \ - ctzl(sizeof(ETYPE)), GETPC()); \ +#define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST) \ +void HELPER(NAME)(void *vd, void *v0, target_ulong base, \ + CPURISCVState *env, uint32_t desc) \ +{ \ + vext_ldff(vd, v0, base, env, desc, LOAD_FN_TLB, \ + LOAD_FN_HOST, ctzl(sizeof(ETYPE)), GETPC()); \ } -GEN_VEXT_LDFF(vle8ff_v, int8_t, lde_b_tlb) -GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h_tlb) -GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w_tlb) -GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d_tlb) +GEN_VEXT_LDFF(vle8ff_v, int8_t, lde_b_tlb, lde_b_host) +GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h_tlb, lde_h_host) +GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w_tlb, lde_w_host) +GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d_tlb, lde_d_host) #define DO_SWAP(N, M) (M) #define DO_AND(N, M) (N & M) From e32988789b63d6b09754d4812b87d5bf7ebb37b2 Mon Sep 17 00:00:00 2001 From: Max Chou Date: Thu, 19 Sep 2024 01:14:11 +0800 Subject: [PATCH 09/12] target/riscv: rvv: Provide group continuous ld/st flow for unit-stride ld/st instructions The vector unmasked unit-stride and whole register load/store instructions will load/store continuous memory. If the endian of both the host and guest architecture are the same, then we can group the element load/store to load/store more data at a time. Signed-off-by: Max Chou Reviewed-by: Daniel Henrique Barboza Message-ID: <20240918171412.150107-7-max.chou@sifive.com> Signed-off-by: Alistair Francis --- target/riscv/vector_helper.c | 77 +++++++++++++++++++++++++++++------- 1 file changed, 63 insertions(+), 14 deletions(-) diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c index d8d476c3ea..3d10ff94cd 100644 --- a/target/riscv/vector_helper.c +++ b/target/riscv/vector_helper.c @@ -190,6 +190,45 @@ GEN_VEXT_ST_ELEM(ste_h, uint16_t, H2, stw) GEN_VEXT_ST_ELEM(ste_w, uint32_t, H4, stl) GEN_VEXT_ST_ELEM(ste_d, uint64_t, H8, stq) +static inline QEMU_ALWAYS_INLINE void +vext_continus_ldst_tlb(CPURISCVState *env, vext_ldst_elem_fn_tlb *ldst_tlb, + void *vd, uint32_t evl, target_ulong addr, + uint32_t reg_start, uintptr_t ra, uint32_t esz, + bool is_load) +{ + uint32_t i; + for (i = env->vstart; i < evl; env->vstart = ++i, addr += esz) { + ldst_tlb(env, adjust_addr(env, addr), i, vd, ra); + } +} + +static inline QEMU_ALWAYS_INLINE void +vext_continus_ldst_host(CPURISCVState *env, vext_ldst_elem_fn_host *ldst_host, + void *vd, uint32_t evl, uint32_t reg_start, void *host, + uint32_t esz, bool is_load) +{ +#if HOST_BIG_ENDIAN + for (; reg_start < evl; reg_start++, host += esz) { + ldst_host(vd, reg_start, host); + } +#else + if (esz == 1) { + uint32_t byte_offset = reg_start * esz; + uint32_t size = (evl - reg_start) * esz; + + if (is_load) { + memcpy(vd + byte_offset, host, size); + } else { + memcpy(host, vd + byte_offset, size); + } + } else { + for (; reg_start < evl; reg_start++, host += esz) { + ldst_host(vd, reg_start, host); + } + } +#endif +} + static void vext_set_tail_elems_1s(target_ulong vl, void *vd, uint32_t desc, uint32_t nf, uint32_t esz, uint32_t max_elems) @@ -298,24 +337,34 @@ vext_page_ldst_us(CPURISCVState *env, void *vd, target_ulong addr, mmu_index, true, &host, ra); if (flags == 0) { - for (i = env->vstart; i < evl; ++i) { - k = 0; - while (k < nf) { - ldst_host(vd, i + k * max_elems, host); - host += esz; - k++; + if (nf == 1) { + vext_continus_ldst_host(env, ldst_host, vd, 
evl, env->vstart, host, + esz, is_load); + } else { + for (i = env->vstart; i < evl; ++i) { + k = 0; + while (k < nf) { + ldst_host(vd, i + k * max_elems, host); + host += esz; + k++; + } } } env->vstart += elems; } else { - /* load bytes from guest memory */ - for (i = env->vstart; i < evl; env->vstart = ++i) { - k = 0; - while (k < nf) { - ldst_tlb(env, adjust_addr(env, addr), i + k * max_elems, vd, - ra); - addr += esz; - k++; + if (nf == 1) { + vext_continus_ldst_tlb(env, ldst_tlb, vd, evl, addr, env->vstart, + ra, esz, is_load); + } else { + /* load bytes from guest memory */ + for (i = env->vstart; i < evl; env->vstart = ++i) { + k = 0; + while (k < nf) { + ldst_tlb(env, adjust_addr(env, addr), i + k * max_elems, + vd, ra); + addr += esz; + k++; + } } } } From f8ee6f533d696b34f74cf4dcf542961776d56e4e Mon Sep 17 00:00:00 2001 From: Max Chou Date: Thu, 19 Sep 2024 01:14:12 +0800 Subject: [PATCH 10/12] target/riscv: Inline unit-stride ld/st and corresponding functions for performance In the vector unit-stride load/store helper functions. the vext_ldst_us & vext_ldst_whole functions corresponding most of the execution time. Inline the functions can avoid the function call overhead to improve the helper function performance. Signed-off-by: Max Chou Reviewed-by: Richard Henderson Reviewed-by: Daniel Henrique Barboza Message-ID: <20240918171412.150107-8-max.chou@sifive.com> Signed-off-by: Alistair Francis --- target/riscv/vector_helper.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c index 3d10ff94cd..a85dd1d200 100644 --- a/target/riscv/vector_helper.c +++ b/target/riscv/vector_helper.c @@ -153,14 +153,16 @@ typedef void vext_ldst_elem_fn_tlb(CPURISCVState *env, abi_ptr addr, typedef void vext_ldst_elem_fn_host(void *vd, uint32_t idx, void *host); #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF) \ -static void NAME##_tlb(CPURISCVState *env, abi_ptr addr, \ +static inline QEMU_ALWAYS_INLINE \ +void NAME##_tlb(CPURISCVState *env, abi_ptr addr, \ uint32_t idx, void *vd, uintptr_t retaddr) \ { \ ETYPE *cur = ((ETYPE *)vd + H(idx)); \ *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr); \ } \ \ -static void NAME##_host(void *vd, uint32_t idx, void *host) \ +static inline QEMU_ALWAYS_INLINE \ +void NAME##_host(void *vd, uint32_t idx, void *host) \ { \ ETYPE *cur = ((ETYPE *)vd + H(idx)); \ *cur = (ETYPE)LDSUF##_p(host); \ @@ -172,14 +174,16 @@ GEN_VEXT_LD_ELEM(lde_w, uint32_t, H4, ldl) GEN_VEXT_LD_ELEM(lde_d, uint64_t, H8, ldq) #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF) \ -static void NAME##_tlb(CPURISCVState *env, abi_ptr addr, \ +static inline QEMU_ALWAYS_INLINE \ +void NAME##_tlb(CPURISCVState *env, abi_ptr addr, \ uint32_t idx, void *vd, uintptr_t retaddr) \ { \ ETYPE data = *((ETYPE *)vd + H(idx)); \ cpu_##STSUF##_data_ra(env, addr, data, retaddr); \ } \ \ -static void NAME##_host(void *vd, uint32_t idx, void *host) \ +static inline QEMU_ALWAYS_INLINE \ +void NAME##_host(void *vd, uint32_t idx, void *host) \ { \ ETYPE data = *((ETYPE *)vd + H(idx)); \ STSUF##_p(host, data); \ @@ -318,7 +322,7 @@ GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d_tlb) */ /* unmasked unit-stride load and store operation */ -static void +static inline QEMU_ALWAYS_INLINE void vext_page_ldst_us(CPURISCVState *env, void *vd, target_ulong addr, uint32_t elems, uint32_t nf, uint32_t max_elems, uint32_t log2_esz, bool is_load, int mmu_index, @@ -370,7 +374,7 @@ vext_page_ldst_us(CPURISCVState *env, void *vd, target_ulong 
addr, } } -static void +static inline QEMU_ALWAYS_INLINE void vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc, vext_ldst_elem_fn_tlb *ldst_tlb, vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz, @@ -757,7 +761,7 @@ GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d_tlb, lde_d_host) /* * load and store whole register instructions */ -static void +static inline QEMU_ALWAYS_INLINE void vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc, vext_ldst_elem_fn_tlb *ldst_tlb, vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz, From 13d438502b2945f52fd2aa266efb201dc776cb4b Mon Sep 17 00:00:00 2001 From: Quan Zhou Date: Tue, 24 Sep 2024 16:30:01 +0800 Subject: [PATCH 11/12] target/riscv/kvm: Update kvm exts to Linux v6.11 Add support for a few Zc* extensions, Zimop, Zcmop and Zawrs. Signed-off-by: Quan Zhou Reviewed-by: Andrew Jones Reviewed-by: Jim Shu Message-ID: Signed-off-by: Alistair Francis --- target/riscv/kvm/kvm-cpu.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/target/riscv/kvm/kvm-cpu.c b/target/riscv/kvm/kvm-cpu.c index cbda4596da..c53ca1f76b 100644 --- a/target/riscv/kvm/kvm-cpu.c +++ b/target/riscv/kvm/kvm-cpu.c @@ -281,7 +281,10 @@ static KVMCPUConfig kvm_multi_ext_cfgs[] = { KVM_EXT_CFG("zihintntl", ext_zihintntl, KVM_RISCV_ISA_EXT_ZIHINTNTL), KVM_EXT_CFG("zihintpause", ext_zihintpause, KVM_RISCV_ISA_EXT_ZIHINTPAUSE), KVM_EXT_CFG("zihpm", ext_zihpm, KVM_RISCV_ISA_EXT_ZIHPM), + KVM_EXT_CFG("zimop", ext_zimop, KVM_RISCV_ISA_EXT_ZIMOP), + KVM_EXT_CFG("zcmop", ext_zcmop, KVM_RISCV_ISA_EXT_ZCMOP), KVM_EXT_CFG("zacas", ext_zacas, KVM_RISCV_ISA_EXT_ZACAS), + KVM_EXT_CFG("zawrs", ext_zawrs, KVM_RISCV_ISA_EXT_ZAWRS), KVM_EXT_CFG("zfa", ext_zfa, KVM_RISCV_ISA_EXT_ZFA), KVM_EXT_CFG("zfh", ext_zfh, KVM_RISCV_ISA_EXT_ZFH), KVM_EXT_CFG("zfhmin", ext_zfhmin, KVM_RISCV_ISA_EXT_ZFHMIN), @@ -292,6 +295,10 @@ static KVMCPUConfig kvm_multi_ext_cfgs[] = { KVM_EXT_CFG("zbkc", ext_zbkc, KVM_RISCV_ISA_EXT_ZBKC), KVM_EXT_CFG("zbkx", ext_zbkx, KVM_RISCV_ISA_EXT_ZBKX), KVM_EXT_CFG("zbs", ext_zbs, KVM_RISCV_ISA_EXT_ZBS), + KVM_EXT_CFG("zca", ext_zca, KVM_RISCV_ISA_EXT_ZCA), + KVM_EXT_CFG("zcb", ext_zcb, KVM_RISCV_ISA_EXT_ZCB), + KVM_EXT_CFG("zcd", ext_zcd, KVM_RISCV_ISA_EXT_ZCD), + KVM_EXT_CFG("zcf", ext_zcf, KVM_RISCV_ISA_EXT_ZCF), KVM_EXT_CFG("zknd", ext_zknd, KVM_RISCV_ISA_EXT_ZKND), KVM_EXT_CFG("zkne", ext_zkne, KVM_RISCV_ISA_EXT_ZKNE), KVM_EXT_CFG("zknh", ext_zknh, KVM_RISCV_ISA_EXT_ZKNH), From 27652f9ca9d831c67dd447346c6ee953669255f0 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Tue, 5 Nov 2024 11:35:19 +0100 Subject: [PATCH 12/12] tests/functional: Convert the RV32-on-RV64 riscv test A straggler that has been added to the Avocado framework while the conversion to the functional framework was already in progress... Move it over now, too! 
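For reference, the converted test can be run like the other functional
tests, e.g. with

    $ make check-functional-riscv64

from a riscv64 build tree, or by executing the script directly with
PYTHONPATH and QEMU_TEST_QEMU_BINARY set up as described in the functional
testing documentation (the exact target and variable names follow that
framework's conventions and are not introduced by this patch).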
Signed-off-by: Thomas Huth Reviewed-by: Alistair Francis Reviewed-by: LIU Zhiwei Message-ID: <20241105103519.341304-1-thuth@redhat.com> Signed-off-by: Alistair Francis --- tests/avocado/tuxrun_baselines.py | 16 ---------------- tests/functional/test_riscv64_tuxrun.py | 13 +++++++++++++ 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/tests/avocado/tuxrun_baselines.py b/tests/avocado/tuxrun_baselines.py index 366c262e32..38064840da 100644 --- a/tests/avocado/tuxrun_baselines.py +++ b/tests/avocado/tuxrun_baselines.py @@ -222,19 +222,3 @@ def test_arm64be(self): "rootfs.ext4.zst" : "e6ffd8813c8a335bc15728f2835f90539c84be7f8f5f691a8b01451b47fb4bd7"} self.common_tuxrun(csums=sums) - - def test_riscv64_rv32(self): - """ - :avocado: tags=arch:riscv64 - :avocado: tags=machine:virt - :avocado: tags=tuxboot:riscv32 - :avocado: tags=cpu:rv32 - """ - sums = { "Image" : - "89599407d7334de629a40e7ad6503c73670359eb5f5ae9d686353a3d6deccbd5", - "fw_jump.elf" : - "f2ef28a0b77826f79d085d3e4aa686f1159b315eff9099a37046b18936676985", - "rootfs.ext4.zst" : - "7168d296d0283238ea73cd5a775b3dd608e55e04c7b92b76ecce31bb13108cba" } - - self.common_tuxrun(csums=sums) diff --git a/tests/functional/test_riscv64_tuxrun.py b/tests/functional/test_riscv64_tuxrun.py index 13501628f9..4e2449539c 100755 --- a/tests/functional/test_riscv64_tuxrun.py +++ b/tests/functional/test_riscv64_tuxrun.py @@ -23,6 +23,13 @@ class TuxRunRiscV64Test(TuxRunBaselineTest): 'https://storage.tuxboot.com/20230331/riscv64/rootfs.ext4.zst', 'b18e3a3bdf27be03da0b285e84cb71bf09eca071c3a087b42884b6982ed679eb') + ASSET_RISCV32_KERNEL = Asset( + 'https://storage.tuxboot.com/20230331/riscv32/Image', + '89599407d7334de629a40e7ad6503c73670359eb5f5ae9d686353a3d6deccbd5') + ASSET_RISCV32_ROOTFS = Asset( + 'https://storage.tuxboot.com/20230331/riscv32/rootfs.ext4.zst', + '7168d296d0283238ea73cd5a775b3dd608e55e04c7b92b76ecce31bb13108cba') + def test_riscv64(self): self.set_machine('virt') self.common_tuxrun(kernel_asset=self.ASSET_RISCV64_KERNEL, @@ -34,5 +41,11 @@ def test_riscv64_maxcpu(self): self.common_tuxrun(kernel_asset=self.ASSET_RISCV64_KERNEL, rootfs_asset=self.ASSET_RISCV64_ROOTFS) + def test_riscv64_rv32(self): + self.set_machine('virt') + self.cpu='rv32' + self.common_tuxrun(kernel_asset=self.ASSET_RISCV32_KERNEL, + rootfs_asset=self.ASSET_RISCV32_ROOTFS) + if __name__ == '__main__': TuxRunBaselineTest.main()