From 0502d09889e4055fcd1f322d57de4d20b373290a Mon Sep 17 00:00:00 2001 From: Anzooooo Date: Fri, 29 Nov 2024 13:17:46 +0800 Subject: [PATCH 1/4] feat(LoadPipe): let 128bitReq be accessed at 128-bit aligned granularity --- .../scala/xiangshan/cache/dcache/loadpipe/LoadPipe.scala | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/main/scala/xiangshan/cache/dcache/loadpipe/LoadPipe.scala b/src/main/scala/xiangshan/cache/dcache/loadpipe/LoadPipe.scala index cbd4877b13..e1df398570 100644 --- a/src/main/scala/xiangshan/cache/dcache/loadpipe/LoadPipe.scala +++ b/src/main/scala/xiangshan/cache/dcache/loadpipe/LoadPipe.scala @@ -131,9 +131,10 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer io.tag_read.valid := io.lsu.req.fire && !io.nack val s0_valid = io.lsu.req.fire - val s0_req = io.lsu.req.bits + val s0_req = WireInit(io.lsu.req.bits) + s0_req.vaddr := Mux(io.load128Req, Cat(io.lsu.req.bits.vaddr(io.lsu.req.bits.vaddr.getWidth - 1, 4), 0.U(4.W)), io.lsu.req.bits.vaddr) val s0_fire = s0_valid && s1_ready - val s0_vaddr = s0_req.vaddr + val s0_vaddr = Mux(io.load128Req, Cat(s0_req.vaddr(s0_req.vaddr.getWidth - 1, 4), 0.U(4.W)), s0_req.vaddr) val s0_replayCarry = s0_req.replayCarry val s0_load128Req = io.load128Req val s0_bank_oh_64 = UIntToOH(addr_to_dcache_bank(s0_vaddr)) @@ -179,7 +180,8 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer val s1_load128Req = RegEnable(s0_load128Req, s0_fire) val s1_is_prefetch = s1_req.instrtype === DCACHE_PREFETCH_SOURCE.U // LSU may update the address from io.lsu.s1_paddr, which affects the bank read enable only. - val s1_vaddr = Cat(s1_req.vaddr(VAddrBits - 1, blockOffBits), io.lsu.s1_paddr_dup_lsu(blockOffBits - 1, 0)) + val s1_vaddr_update = Cat(s1_req.vaddr(VAddrBits - 1, blockOffBits), io.lsu.s1_paddr_dup_lsu(blockOffBits - 1, 0)) + val s1_vaddr = Mux(s1_load128Req, Cat(s1_vaddr_update(VAddrBits - 1, 4), 0.U(4.W)), s1_vaddr_update) val s1_bank_oh = RegEnable(s0_bank_oh, s0_fire) val s1_nack = RegNext(io.nack) val s1_fire = s1_valid && s2_ready From 63095dede36b09c4c39c371a7f99b7fceb065f5c Mon Sep 17 00:00:00 2001 From: Anzooooo Date: Thu, 7 Nov 2024 23:23:32 +0800 Subject: [PATCH 2/4] feat(Zicclsm): minor refactoring misalign and support for vector misalign --- .../scala/xiangshan/backend/MemBlock.scala | 74 +++- .../cache/dcache/DCacheWrapper.scala | 3 +- .../scala/xiangshan/cache/mmu/Repeater.scala | 2 +- src/main/scala/xiangshan/mem/MemCommon.scala | 62 +++- .../xiangshan/mem/lsqueue/LSQWrapper.scala | 4 +- .../mem/lsqueue/LoadExceptionBuffer.scala | 7 +- .../mem/lsqueue/LoadMisalignBuffer.scala | 256 +++++++------ .../xiangshan/mem/lsqueue/LoadQueue.scala | 6 +- .../mem/lsqueue/LoadQueueReplay.scala | 9 +- .../mem/lsqueue/StoreMisalignBuffer.scala | 342 ++++++++++-------- .../xiangshan/mem/lsqueue/StoreQueue.scala | 250 +++++++++---- .../mem/lsqueue/VirtualLoadQueue.scala | 3 +- .../xiangshan/mem/pipeline/LoadUnit.scala | 233 ++++++++---- .../xiangshan/mem/pipeline/StoreUnit.scala | 117 +++--- .../xiangshan/mem/vector/VMergeBuffer.scala | 9 +- .../xiangshan/mem/vector/VSegmentUnit.scala | 229 ++++++++++-- .../scala/xiangshan/mem/vector/VSplit.scala | 21 +- .../xiangshan/mem/vector/VecBundle.scala | 9 + 18 files changed, 1085 insertions(+), 551 deletions(-) diff --git a/src/main/scala/xiangshan/backend/MemBlock.scala b/src/main/scala/xiangshan/backend/MemBlock.scala index 67720b0c51..c11c2b9414 100644 --- a/src/main/scala/xiangshan/backend/MemBlock.scala +++ 
b/src/main/scala/xiangshan/backend/MemBlock.scala @@ -457,14 +457,17 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer) // misalignBuffer will overwrite the source from ldu if it is about to writeback val misalignWritebackOverride = Mux( - loadMisalignBuffer.io.writeBack.valid, - loadMisalignBuffer.io.writeBack.bits, - loadUnits(MisalignWBPort).io.ldout.bits + loadUnits(MisalignWBPort).io.ldout.valid, + loadUnits(MisalignWBPort).io.ldout.bits, + loadMisalignBuffer.io.writeBack.bits ) - ldaExeWbReqs(MisalignWBPort).valid := loadMisalignBuffer.io.writeBack.valid || loadUnits(MisalignWBPort).io.ldout.valid - ldaExeWbReqs(MisalignWBPort).bits := misalignWritebackOverride - loadMisalignBuffer.io.writeBack.ready := ldaExeWbReqs(MisalignWBPort).ready + ldaExeWbReqs(MisalignWBPort).valid := loadMisalignBuffer.io.writeBack.valid || loadUnits(MisalignWBPort).io.ldout.valid + ldaExeWbReqs(MisalignWBPort).bits := misalignWritebackOverride + loadMisalignBuffer.io.writeBack.ready := ldaExeWbReqs(MisalignWBPort).ready && !loadUnits(MisalignWBPort).io.ldout.valid + loadMisalignBuffer.io.loadOutValid := loadUnits(MisalignWBPort).io.ldout.valid + loadMisalignBuffer.io.loadVecOutValid := loadUnits(MisalignWBPort).io.vecldout.valid loadUnits(MisalignWBPort).io.ldout.ready := ldaExeWbReqs(MisalignWBPort).ready + ldaExeWbReqs(MisalignWBPort).bits.isFromLoadUnit := loadUnits(MisalignWBPort).io.ldout.bits.isFromLoadUnit || loadMisalignBuffer.io.writeBack.valid // loadUnit will overwrite the source from uncache if it is about to writeback ldaExeWbReqs(UncacheWBPort) <> loadUnits(UncacheWBPort).io.ldout @@ -805,6 +808,7 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer) dcache.io.lsu.load(0).s0_pc := vSegmentUnit.io.rdcache.s0_pc dcache.io.lsu.load(0).s1_pc := vSegmentUnit.io.rdcache.s1_pc dcache.io.lsu.load(0).s2_pc := vSegmentUnit.io.rdcache.s2_pc + dcache.io.lsu.load(0).is128Req := vSegmentUnit.io.rdcache.is128Req }.otherwise { loadUnits(i).io.dcache.req.ready := dcache.io.lsu.load(i).req.ready @@ -816,6 +820,7 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer) dcache.io.lsu.load(0).s0_pc := loadUnits(0).io.dcache.s0_pc dcache.io.lsu.load(0).s1_pc := loadUnits(0).io.dcache.s1_pc dcache.io.lsu.load(0).s2_pc := loadUnits(0).io.dcache.s2_pc + dcache.io.lsu.load(0).is128Req := loadUnits(0).io.dcache.is128Req } // forward @@ -917,7 +922,7 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer) // connect misalignBuffer loadMisalignBuffer.io.req(i) <> loadUnits(i).io.misalign_buf - if (i == 0) { + if (i == MisalignWBPort) { loadUnits(i).io.misalign_ldin <> loadMisalignBuffer.io.splitLoadReq loadUnits(i).io.misalign_ldout <> loadMisalignBuffer.io.splitLoadResp } else { @@ -1088,7 +1093,7 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer) loadMisalignBuffer.io.rob.pendingPtr := io.ooo_to_mem.lsqio.pendingPtr loadMisalignBuffer.io.rob.pendingPtrNext := io.ooo_to_mem.lsqio.pendingPtrNext - lsq.io.flushFrmMaBuf := loadMisalignBuffer.io.flushLdExpBuff + lsq.io.loadMisalignFull := loadMisalignBuffer.io.loadMisalignFull storeMisalignBuffer.io.redirect <> redirect storeMisalignBuffer.io.rob.lcommit := io.ooo_to_mem.lsqio.lcommit @@ -1258,16 +1263,22 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer) stOut(0).bits := mmioStout.bits mmioStout.ready := true.B } + // vec mmio writeback lsq.io.vecmmioStout.ready := false.B - when (lsq.io.vecmmioStout.valid && 
!storeUnits(0).io.vecstout.valid) { - stOut(0).valid := true.B - stOut(0).bits := lsq.io.vecmmioStout.bits - lsq.io.vecmmioStout.ready := true.B - } +// when (lsq.io.vecmmioStout.valid && !storeUnits(0).io.vecstout.valid) { +// stOut(0).valid := true.B +// stOut(0).bits := lsq.io.vecmmioStout.bits +// lsq.io.vecmmioStout.ready := true.B +// } +// + // miss align buffer will overwrite stOut(0) - storeMisalignBuffer.io.writeBack.ready := true.B - when (storeMisalignBuffer.io.writeBack.valid) { + val storeMisalignCanWriteBack = !mmioStout.valid && !storeUnits(0).io.stout.valid && !storeUnits(0).io.vecstout.valid + storeMisalignBuffer.io.writeBack.ready := storeMisalignCanWriteBack + storeMisalignBuffer.io.storeOutValid := storeUnits(0).io.stout.valid + storeMisalignBuffer.io.storeVecOutValid := storeUnits(0).io.vecstout.valid + when (storeMisalignBuffer.io.writeBack.valid && storeMisalignCanWriteBack) { stOut(0).valid := true.B stOut(0).bits := storeMisalignBuffer.io.writeBack.bits } @@ -1448,6 +1459,9 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer) (0 until VstuCnt).foreach{i => vsMergeBuffer(i).io.fromPipeline := DontCare vsMergeBuffer(i).io.fromSplit := DontCare + + vsMergeBuffer(i).io.fromMisalignBuffer.get.flush := storeMisalignBuffer.io.toVecStoreMergeBuffer(i).flush + vsMergeBuffer(i).io.fromMisalignBuffer.get.mbIndex := storeMisalignBuffer.io.toVecStoreMergeBuffer(i).mbIndex } (0 until VstuCnt).foreach{i => @@ -1463,6 +1477,9 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer) ) vsSplit(i).io.vstd.get := DontCare // Todo: Discuss how to pass vector store data + vsSplit(i).io.vstdMisalign.get.storeMisalignBufferEmpty := !storeMisalignBuffer.io.full + vsSplit(i).io.vstdMisalign.get.storePipeEmpty := !storeUnits(i).io.s0_s1_valid + } (0 until VlduCnt).foreach{i => vlSplit(i).io.redirect <> redirect @@ -1481,12 +1498,35 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer) vfofBuffer.io.in(i).bits := io.ooo_to_mem.issueVldu(i).bits } (0 until LduCnt).foreach{i=> - vlMergeBuffer.io.fromPipeline(i) <> loadUnits(i).io.vecldout + loadUnits(i).io.vecldout.ready := vlMergeBuffer.io.fromPipeline(i).ready + loadMisalignBuffer.io.vecWriteBack.ready := true.B + + if (i == 1) { + when(loadUnits(i).io.vecldout.valid) { + vlMergeBuffer.io.fromPipeline(i).valid := loadUnits(i).io.vecldout.valid + vlMergeBuffer.io.fromPipeline(i).bits := loadUnits(i).io.vecldout.bits + } .otherwise { + vlMergeBuffer.io.fromPipeline(i).valid := loadMisalignBuffer.io.vecWriteBack.valid + vlMergeBuffer.io.fromPipeline(i).bits := loadMisalignBuffer.io.vecWriteBack.bits + } + } else { + vlMergeBuffer.io.fromPipeline(i).valid := loadUnits(i).io.vecldout.valid + vlMergeBuffer.io.fromPipeline(i).bits := loadUnits(i).io.vecldout.bits + } } (0 until StaCnt).foreach{i=> if(i < VstuCnt){ - vsMergeBuffer(i).io.fromPipeline.head <> storeUnits(i).io.vecstout + storeUnits(i).io.vecstout.ready := true.B + storeMisalignBuffer.io.vecWriteBack(i).ready := vsMergeBuffer(i).io.fromPipeline.head.ready + + when(storeUnits(i).io.vecstout.valid) { + vsMergeBuffer(i).io.fromPipeline.head.valid := storeUnits(i).io.vecstout.valid + vsMergeBuffer(i).io.fromPipeline.head.bits := storeUnits(i).io.vecstout.bits + } .otherwise { + vsMergeBuffer(i).io.fromPipeline.head.valid := storeMisalignBuffer.io.vecWriteBack(i).valid + vsMergeBuffer(i).io.fromPipeline.head.bits := storeMisalignBuffer.io.vecWriteBack(i).bits + } } } diff --git 
a/src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala b/src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala index 4a87a11dcc..7a97addb44 100644 --- a/src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala +++ b/src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala @@ -401,7 +401,8 @@ class DCacheWordReqWithVaddr(implicit p: Parameters) extends DCacheWordReq { class DCacheWordReqWithVaddrAndPfFlag(implicit p: Parameters) extends DCacheWordReqWithVaddr { val prefetch = Bool() val vecValid = Bool() - + val sqNeedDeq = Bool() + def toDCacheWordReqWithVaddr() = { val res = Wire(new DCacheWordReqWithVaddr) res.vaddr := vaddr diff --git a/src/main/scala/xiangshan/cache/mmu/Repeater.scala b/src/main/scala/xiangshan/cache/mmu/Repeater.scala index 98d452f356..c2c37caddd 100644 --- a/src/main/scala/xiangshan/cache/mmu/Repeater.scala +++ b/src/main/scala/xiangshan/cache/mmu/Repeater.scala @@ -552,7 +552,7 @@ class PTWFilter(Width: Int, Size: Int, FenceDelay: Int)(implicit p: Parameters) val issue_valid = v(issPtr) && !isEmptyIss && !inflight_full val issue_filtered = ptwResp_valid && ptwResp_hit(io.ptw.req(0).bits.vpn, io.ptw.req(0).bits.s2xlate, ptwResp) - val issue_fire_fake = issue_valid && (io.ptw.req(0).ready || (issue_filtered && false.B /*timing-opt*/)) + val issue_fire_fake = issue_valid && io.ptw.req(0).ready io.ptw.req(0).valid := issue_valid && !issue_filtered io.ptw.req(0).bits.vpn := vpn(issPtr) io.ptw.req(0).bits.s2xlate := s2xlate(issPtr) diff --git a/src/main/scala/xiangshan/mem/MemCommon.scala b/src/main/scala/xiangshan/mem/MemCommon.scala index 600887672d..aa2109f188 100644 --- a/src/main/scala/xiangshan/mem/MemCommon.scala +++ b/src/main/scala/xiangshan/mem/MemCommon.scala @@ -54,6 +54,28 @@ object genVWmask { } } +object genBasemask { + /** + * + * @param addr + * @param sizeEncode + * @return Return 16-byte aligned mask. 
+ * + * Example: + * Address: 0x80000003 Encoding size: ‘b11 + * Return: 0xff + */ + def apply(addr: UInt, sizeEncode: UInt): UInt = { + LookupTree(sizeEncode, List( + "b00".U -> 0x1.U, + "b01".U -> 0x3.U, + "b10".U -> 0xf.U, + "b11".U -> 0xff.U + )) + } +} + + object genWdata { def apply(data: UInt, sizeEncode: UInt): UInt = { LookupTree(sizeEncode, List( @@ -171,6 +193,13 @@ class LsPipelineBundle(implicit p: Parameters) extends XSBundle val schedIndex = UInt(log2Up(LoadQueueReplaySize).W) // hardware prefetch and fast replay no need to query tlb val tlbNoQuery = Bool() + + // misalign + val isMisalign = Bool() + val isFinalSplit = Bool() + val misalignWith16Byte = Bool() + val misalignNeedWakeUp = Bool() + val updateAddrValid = Bool() } class LdPrefetchTrainBundle(implicit p: Parameters) extends LsPipelineBundle { @@ -412,28 +441,29 @@ class StoreNukeQueryIO(implicit p: Parameters) extends XSBundle { class StoreMaBufToSqControlIO(implicit p: Parameters) extends XSBundle { // from storeMisalignBuffer to storeQueue, control it's sbuffer write - val control = Output(new XSBundle { - // control sq to write-into sb - val writeSb = Bool() - val wdata = UInt(VLEN.W) - val wmask = UInt((VLEN / 8).W) + val toStoreQueue = Output(new XSBundle { + // This entry is a cross page + val crossPageWithHit = Bool() + val crossPageCanDeq = Bool() + // High page Paddr val paddr = UInt(PAddrBits.W) - val vaddr = UInt(VAddrBits.W) - val last = Bool() - val hasException = Bool() - // remove this entry in sq - val removeSq = Bool() + + val withSameUop = Bool() }) // from storeQueue to storeMisalignBuffer, provide detail info of this store - val storeInfo = Input(new XSBundle { - val data = UInt(VLEN.W) - // is the data of the unaligned store ready at sq? - val dataReady = Bool() - // complete a data transfer from sq to sb - val completeSbTrans = Bool() + val toStoreMisalignBuffer = Input(new XSBundle { + val sqPtr = new SqPtr + val doDeq = Bool() + + val uop = new DynInst() }) } +class StoreMaBufToVecStoreMergeBufferIO(implicit p: Parameters) extends VLSUBundle{ + val mbIndex = Output(UInt(vsmBindexBits.W)) + val flush = Output(Bool()) +} + // Store byte valid mask write bundle // // Store byte valid mask write to SQ takes 2 cycles diff --git a/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala b/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala index fa44460947..f8e87d9898 100644 --- a/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala @@ -116,7 +116,7 @@ class LsqWrapper(implicit p: Parameters) extends XSModule with HasDCacheParamete val lqDeqPtr = Output(new LqPtr) val sqDeqPtr = Output(new SqPtr) val exceptionAddr = new ExceptionAddrIO - val flushFrmMaBuf = Input(Bool()) + val loadMisalignFull = Input(Bool()) val issuePtrExt = Output(new SqPtr) val l2_hint = Input(Valid(new L2ToL1Hint())) val tlb_hint = Flipped(new TlbHintIO) @@ -208,7 +208,7 @@ class LsqWrapper(implicit p: Parameters) extends XSModule with HasDCacheParamete loadQueue.io.tl_d_channel <> io.tl_d_channel loadQueue.io.release <> io.release loadQueue.io.exceptionAddr.isStore := DontCare - loadQueue.io.flushFrmMaBuf := io.flushFrmMaBuf + loadQueue.io.loadMisalignFull := io.loadMisalignFull loadQueue.io.lqCancelCnt <> io.lqCancelCnt loadQueue.io.sq.stAddrReadySqPtr <> storeQueue.io.stAddrReadySqPtr loadQueue.io.sq.stAddrReadyVec <> storeQueue.io.stAddrReadyVec diff --git a/src/main/scala/xiangshan/mem/lsqueue/LoadExceptionBuffer.scala 
b/src/main/scala/xiangshan/mem/lsqueue/LoadExceptionBuffer.scala index c7ae0c5159..11cc3a952f 100644 --- a/src/main/scala/xiangshan/mem/lsqueue/LoadExceptionBuffer.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/LoadExceptionBuffer.scala @@ -37,7 +37,6 @@ class LqExceptionBuffer(implicit p: Parameters) extends XSModule with HasCircula val io = IO(new Bundle() { val redirect = Flipped(Valid(new Redirect)) val req = Vec(enqPortNum, Flipped(Valid(new LqWriteBundle))) - val flushFrmMaBuf = Input(Bool()) val exceptionAddr = new ExceptionAddrIO }) @@ -67,7 +66,7 @@ class LqExceptionBuffer(implicit p: Parameters) extends XSModule with HasCircula when (req_valid && req.uop.robIdx.needFlush(io.redirect)) { req_valid := s2_enqueue.asUInt.orR } .elsewhen (s2_enqueue.asUInt.orR) { - req_valid := req_valid || true.B + req_valid := true.B } def selectOldest[T <: LqWriteBundle](valid: Seq[Bool], bits: Seq[T]): (Seq[Bool], Seq[T]) = { @@ -111,10 +110,6 @@ class LqExceptionBuffer(implicit p: Parameters) extends XSModule with HasCircula io.exceptionAddr.gpaddr := req.gpaddr io.exceptionAddr.isForVSnonLeafPTE := req.isForVSnonLeafPTE - when(req_valid && io.flushFrmMaBuf) { - req_valid := false.B - } - XSPerfAccumulate("exception", !RegNext(req_valid) && req_valid) // end diff --git a/src/main/scala/xiangshan/mem/lsqueue/LoadMisalignBuffer.scala b/src/main/scala/xiangshan/mem/lsqueue/LoadMisalignBuffer.scala index 326b1db75c..283386c9ca 100644 --- a/src/main/scala/xiangshan/mem/lsqueue/LoadMisalignBuffer.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/LoadMisalignBuffer.scala @@ -115,11 +115,14 @@ class LoadMisalignBuffer(implicit p: Parameters) extends XSModule val io = IO(new Bundle() { val redirect = Flipped(Valid(new Redirect)) - val req = Vec(enqPortNum, Flipped(Valid(new LqWriteBundle))) + val req = Vec(enqPortNum, Flipped(Decoupled(new LqWriteBundle))) val rob = Flipped(new RobLsqIO) val splitLoadReq = Decoupled(new LsPipelineBundle) val splitLoadResp = Flipped(Valid(new LqWriteBundle)) val writeBack = Decoupled(new MemExuOutput) + val vecWriteBack = Decoupled(new VecPipelineFeedbackIO(isVStore = false)) + val loadOutValid = Input(Bool()) + val loadVecOutValid = Input(Bool()) val overwriteExpBuf = Output(new XSBundle { val valid = Bool() val vaddr = UInt(XLEN.W) @@ -128,6 +131,7 @@ class LoadMisalignBuffer(implicit p: Parameters) extends XSModule val isForVSnonLeafPTE = Bool() }) val flushLdExpBuff = Output(Bool()) + val loadMisalignFull = Output(Bool()) }) io.rob.mmio := 0.U.asTypeOf(Vec(LoadPipelineWidth, Bool())) @@ -136,57 +140,41 @@ class LoadMisalignBuffer(implicit p: Parameters) extends XSModule val req_valid = RegInit(false.B) val req = Reg(new LqWriteBundle) - // enqueue - // s1: - val s1_req = VecInit(io.req.map(_.bits)) - val s1_valid = VecInit(io.req.map(x => x.valid)) - - // s2: delay 1 cycle - val s2_req = RegNext(s1_req) - val s2_valid = (0 until enqPortNum).map(i => - RegNext(s1_valid(i)) && - !s2_req(i).uop.robIdx.needFlush(RegNext(io.redirect)) && - !s2_req(i).uop.robIdx.needFlush(io.redirect) - ) - val s2_miss_aligned = s2_req.map(x => - x.uop.exceptionVec(loadAddrMisaligned) && !x.uop.exceptionVec(breakPoint) && !TriggerAction.isDmode(x.uop.trigger) - ) - - val s2_enqueue = Wire(Vec(enqPortNum, Bool())) - for (w <- 0 until enqPortNum) { - s2_enqueue(w) := s2_valid(w) && s2_miss_aligned(w) - } + io.loadMisalignFull := req_valid - when (req_valid && req.uop.robIdx.needFlush(io.redirect)) { - req_valid := s2_enqueue.asUInt.orR - } .elsewhen (s2_enqueue.asUInt.orR) { - req_valid := 
req_valid || true.B + (0 until io.req.length).map{i => + if (i == 0) { + io.req(0).ready := !req_valid && io.req(0).valid + } + else { + io.req(i).ready := !io.req.take(i).map(_.ready).reduce(_ || _) && !req_valid && io.req(i).valid + } } - val reqSel = selectOldest(s2_enqueue, s2_req) - when (req_valid) { - req := Mux( - reqSel._1(0) && (isAfter(req.uop.robIdx, reqSel._2(0).uop.robIdx) || (isNotBefore(req.uop.robIdx, reqSel._2(0).uop.robIdx) && req.uop.uopIdx > reqSel._2(0).uop.uopIdx)), - reqSel._2(0), - req) - } .elsewhen (s2_enqueue.asUInt.orR) { - req := reqSel._2(0) + val select_req_bit = ParallelPriorityMux(io.req.map(_.valid), io.req.map(_.bits)) + val select_req_valid = io.req.map(_.valid).reduce(_ || _) + val canEnqValid = !req_valid && !select_req_bit.uop.robIdx.needFlush(io.redirect) && select_req_valid + when(canEnqValid) { + req := select_req_bit + req_valid := true.B } - val robMatch = req_valid && io.rob.pendingld && (io.rob.pendingPtr === req.uop.robIdx) - // buffer control: // - split miss-aligned load into aligned loads // - send split load to ldu and get result from ldu // - merge them and write back to rob - val s_idle :: s_split :: s_req :: s_resp :: s_comb :: s_wb :: s_wait :: Nil = Enum(7) + val s_idle :: s_split :: s_req :: s_resp :: s_comb_wakeup_rep :: s_wb :: Nil = Enum(6) val bufferState = RegInit(s_idle) val splitLoadReqs = RegInit(VecInit(List.fill(maxSplitNum)(0.U.asTypeOf(new LsPipelineBundle)))) val splitLoadResp = RegInit(VecInit(List.fill(maxSplitNum)(0.U.asTypeOf(new LqWriteBundle)))) val exceptionVec = RegInit(0.U.asTypeOf(ExceptionVec())) val unSentLoads = RegInit(0.U(maxSplitNum.W)) val curPtr = RegInit(0.U(log2Ceil(maxSplitNum).W)) + val needWakeUpReqsWire = Wire(Bool()) + val needWakeUpReqsReg = RegInit(false.B) + val needWakeUpWB = RegInit(false.B) + val data_select = RegEnable(genRdataOH(select_req_bit.uop), 0.U(genRdataOH(select_req_bit.uop).getWidth.W), canEnqValid) // if there is exception or mmio in split load val globalException = RegInit(false.B) @@ -194,10 +182,10 @@ class LoadMisalignBuffer(implicit p: Parameters) extends XSModule val hasException = ExceptionNO.selectByFu(io.splitLoadResp.bits.uop.exceptionVec, LduCfg).asUInt.orR val isMMIO = io.splitLoadResp.bits.mmio - + needWakeUpReqsWire := false.B switch(bufferState) { is (s_idle) { - when (robMatch) { + when (req_valid) { bufferState := s_split } } @@ -225,36 +213,72 @@ class LoadMisalignBuffer(implicit p: Parameters) extends XSModule // need replay or still has unsent requests bufferState := s_req } .otherwise { - // merge the split load results - bufferState := s_comb + when(!req.isvec) { + // merge the split load results + bufferState := s_comb_wakeup_rep + needWakeUpReqsWire := true.B + needWakeUpWB := true.B + when (!io.splitLoadReq.fire) { + needWakeUpReqsReg := true.B + } + } .otherwise { + bufferState := s_comb_wakeup_rep + } + } } } - is (s_comb) { - bufferState := s_wb + is (s_comb_wakeup_rep) { + when(!req.isvec) { + when(needWakeUpReqsReg) { + when(io.splitLoadReq.fire) { + bufferState := s_wb + needWakeUpReqsReg := false.B + }.otherwise { + bufferState := s_comb_wakeup_rep + } + needWakeUpReqsWire := true.B + } .otherwise { + bufferState := s_comb_wakeup_rep + } + } .otherwise { + bufferState := s_wb + } + } is (s_wb) { - when(io.writeBack.fire) { - bufferState := s_wait - } - } + when(req.isvec) { + when(io.vecWriteBack.fire) { + bufferState := s_idle + req_valid := false.B + curPtr := 0.U + unSentLoads := 0.U + globalException := false.B + globalMMIO := false.B + 
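// --- Illustrative sketch, not part of the patch: a simplified software model of how the
// --- LoadMisalignBuffer's s_split stage divides an access that crosses a 16-byte boundary
// --- into a low and a high aligned access (cf. cross16BytesBoundary above). Names and the
// --- exact split granularity here are hypothetical stand-ins, not the RTL signals.
object MisalignSplitModel {
  // Returns (lowAddr, lowBytes, highAddr, highBytes) for an access of `size` bytes at `vaddr`.
  def split(vaddr: BigInt, size: Int): (BigInt, Int, BigInt, Int) = {
    val boundary = (vaddr | 0xF) + 1          // next 16-byte boundary above vaddr
    val lowBytes = math.min(size, (boundary - vaddr).toInt)
    val highBytes = size - lowBytes           // 0 if the access does not cross the boundary
    (vaddr, lowBytes, boundary, highBytes)
  }

  def main(args: Array[String]): Unit = {
    // ld (8 bytes) at ...0d crosses: 3 bytes come from the low half, 5 from the high half.
    println(split(BigInt("8000000d", 16), 8))
  }
}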
needWakeUpReqsReg := false.B + needWakeUpWB := false.B + } - is (s_wait) { - when(io.rob.lcommit =/= 0.U || req.uop.robIdx.needFlush(io.redirect)) { - // rob commits the unaligned load or handled the exception, reset all state - bufferState := s_idle - req_valid := false.B - curPtr := 0.U - unSentLoads := 0.U - globalException := false.B - globalMMIO := false.B + } .otherwise { + when(io.writeBack.fire) { + bufferState := s_idle + req_valid := false.B + curPtr := 0.U + unSentLoads := 0.U + globalException := false.B + globalMMIO := false.B + needWakeUpReqsReg := false.B + needWakeUpWB := false.B + } } + } } - val highAddress = LookupTree(req.uop.fuOpType(1, 0), List( + val alignedType = Mux(req.isvec, req.alignedType(1,0), req.uop.fuOpType(1, 0)) + val highAddress = LookupTree(alignedType, List( LB -> 0.U, LH -> 1.U, LW -> 3.U, @@ -277,17 +301,7 @@ class LoadMisalignBuffer(implicit p: Parameters) extends XSModule when (bufferState === s_split) { when (!cross16BytesBoundary) { - // change this unaligned load into a 128 bits load - unSentLoads := 1.U - curPtr := 0.U - new128Load.vaddr := aligned16BytesAddr - new128Load.fullva := req.fullva - // new128Load.mask := (getMask(req.uop.fuOpType(1, 0)) << aligned16BytesSel).asUInt - new128Load.mask := 0xffff.U - new128Load.uop := req.uop - new128Load.uop.exceptionVec(loadAddrMisaligned) := false.B - new128Load.is128bit := true.B - splitLoadReqs(0) := new128Load + assert(false.B, s"There should be no non-aligned access that does not cross 16Byte boundaries.") } .otherwise { // split this unaligned load into `maxSplitNum` aligned loads unSentLoads := Fill(maxSplitNum, 1.U(1.W)) @@ -299,7 +313,7 @@ class LoadMisalignBuffer(implicit p: Parameters) extends XSModule highAddrLoad.uop.exceptionVec(loadAddrMisaligned) := false.B highAddrLoad.fullva := req.fullva - switch (req.uop.fuOpType(1, 0)) { + switch (alignedType(1, 0)) { is (LB) { assert(false.B, "lb should not trigger miss align") } @@ -481,13 +495,17 @@ class LoadMisalignBuffer(implicit p: Parameters) extends XSModule exceptionVec := 0.U.asTypeOf(exceptionVec.cloneType) } - io.splitLoadReq.valid := req_valid && (bufferState === s_req) + io.splitLoadReq.valid := req_valid && (bufferState === s_req || bufferState === s_comb_wakeup_rep && needWakeUpReqsReg && !req.isvec) io.splitLoadReq.bits := splitLoadReqs(curPtr) + io.splitLoadReq.bits.isvec := req.isvec + io.splitLoadReq.bits.misalignNeedWakeUp := needWakeUpReqsWire + io.splitLoadReq.bits.isFinalSplit := curPtr(0) && !needWakeUpReqsWire // Restore the information of H extension load // bit encoding: | hlv 1 | hlvx 1 | is unsigned(1bit) | size(2bit) | val reqIsHlv = LSUOpType.isHlv(req.uop.fuOpType) val reqIsHlvx = LSUOpType.isHlvx(req.uop.fuOpType) - io.splitLoadReq.bits.uop.fuOpType := Cat(reqIsHlv, reqIsHlvx, 0.U(1.W), splitLoadReqs(curPtr).uop.fuOpType(1, 0)) + io.splitLoadReq.bits.uop.fuOpType := Mux(req.isvec, req.uop.fuOpType, Cat(reqIsHlv, reqIsHlvx, 0.U(1.W), splitLoadReqs(curPtr).uop.fuOpType(1, 0))) + io.splitLoadReq.bits.alignedType := Mux(req.isvec, splitLoadReqs(curPtr).uop.fuOpType(1, 0), req.alignedType) when (io.splitLoadResp.valid) { val resp = io.splitLoadResp.bits @@ -509,61 +527,34 @@ class LoadMisalignBuffer(implicit p: Parameters) extends XSModule val combinedData = RegInit(0.U(XLEN.W)) - when (bufferState === s_comb) { - when (!cross16BytesBoundary) { - val shiftData = LookupTree(aligned16BytesSel, List( - "b0000".U -> splitLoadResp(0).data(63, 0), - "b0001".U -> splitLoadResp(0).data(71, 8), - "b0010".U -> 
splitLoadResp(0).data(79, 16), - "b0011".U -> splitLoadResp(0).data(87, 24), - "b0100".U -> splitLoadResp(0).data(95, 32), - "b0101".U -> splitLoadResp(0).data(103, 40), - "b0110".U -> splitLoadResp(0).data(111, 48), - "b0111".U -> splitLoadResp(0).data(119, 56), - "b1000".U -> splitLoadResp(0).data(127, 64), - "b1001".U -> splitLoadResp(0).data(127, 72), - "b1010".U -> splitLoadResp(0).data(127, 80), - "b1011".U -> splitLoadResp(0).data(127, 88), - "b1100".U -> splitLoadResp(0).data(127, 96), - "b1101".U -> splitLoadResp(0).data(127, 104), - "b1110".U -> splitLoadResp(0).data(127, 112), - "b1111".U -> splitLoadResp(0).data(127, 120) - )) - val truncateData = LookupTree(req.uop.fuOpType(1, 0), List( - LB -> shiftData(7, 0), // lb - LH -> shiftData(15, 0), // lh - LW -> shiftData(31, 0), // lw - LD -> shiftData(63, 0) // ld - )) - combinedData := rdataHelper(req.uop, truncateData(XLEN - 1, 0)) - } .otherwise { - val lowAddrResult = getShiftAndTruncateData(lowResultShift, lowResultWidth, splitLoadResp(0).data) - .asTypeOf(Vec(XLEN / 8, UInt(8.W))) - val highAddrResult = getShiftAndTruncateData(highResultShift, highResultWidth, splitLoadResp(1).data) - .asTypeOf(Vec(XLEN / 8, UInt(8.W))) - val catResult = Wire(Vec(XLEN / 8, UInt(8.W))) - (0 until XLEN / 8) .map { - case i => { - when (i.U < lowResultWidth) { - catResult(i) := lowAddrResult(i) - } .otherwise { - catResult(i) := highAddrResult(i.U - lowResultWidth) - } + when (bufferState === s_comb_wakeup_rep) { + val lowAddrResult = getShiftAndTruncateData(lowResultShift, lowResultWidth, splitLoadResp(0).data) + .asTypeOf(Vec(XLEN / 8, UInt(8.W))) + val highAddrResult = getShiftAndTruncateData(highResultShift, highResultWidth, splitLoadResp(1).data) + .asTypeOf(Vec(XLEN / 8, UInt(8.W))) + val catResult = Wire(Vec(XLEN / 8, UInt(8.W))) + (0 until XLEN / 8) .map { + case i => { + when (i.U < lowResultWidth) { + catResult(i) := lowAddrResult(i) + } .otherwise { + catResult(i) := highAddrResult(i.U - lowResultWidth) } } - combinedData := rdataHelper(req.uop, (catResult.asUInt)(XLEN - 1, 0)) } + combinedData := Mux(req.isvec, rdataVecHelper(req.alignedType, (catResult.asUInt)(XLEN - 1, 0)), rdataHelper(req.uop, (catResult.asUInt)(XLEN - 1, 0))) + } - io.writeBack.valid := req_valid && (bufferState === s_wb) + io.writeBack.valid := req_valid && (bufferState === s_wb) && (io.splitLoadResp.valid && io.splitLoadResp.bits.misalignNeedWakeUp || globalMMIO || globalException) && !io.loadOutValid && !req.isvec io.writeBack.bits.uop := req.uop io.writeBack.bits.uop.exceptionVec := DontCare LduCfg.exceptionOut.map(no => io.writeBack.bits.uop.exceptionVec(no) := (globalMMIO || globalException) && exceptionVec(no)) io.writeBack.bits.uop.fuType := FuType.ldu.U - io.writeBack.bits.uop.flushPipe := Mux(globalMMIO || globalException, false.B, true.B) + io.writeBack.bits.uop.flushPipe := false.B io.writeBack.bits.uop.replayInst := false.B - io.writeBack.bits.data := combinedData - io.writeBack.bits.isFromLoadUnit := DontCare + io.writeBack.bits.data := newRdataHelper(data_select, combinedData) + io.writeBack.bits.isFromLoadUnit := needWakeUpWB io.writeBack.bits.debug.isMMIO := globalMMIO // FIXME lyq: temporarily set to false io.writeBack.bits.debug.isNC := false.B @@ -571,9 +562,38 @@ class LoadMisalignBuffer(implicit p: Parameters) extends XSModule io.writeBack.bits.debug.paddr := req.paddr io.writeBack.bits.debug.vaddr := req.vaddr + + // vector output + io.vecWriteBack.valid := req_valid && (bufferState === s_wb) && !io.loadVecOutValid && req.isvec + + 
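// --- Illustrative sketch, not part of the patch: the byte-wise merge of the two split-load
// --- results, mirroring the catResult logic above (low bytes first, then the high access's
// --- bytes). Purely a software model; `low`/`high`/`lowBytes` stand in for
// --- splitLoadResp(0/1).data and lowResultWidth and are not the RTL names.
object MisalignMergeModel {
  // low/high are little-endian byte sequences returned by the two split accesses.
  def merge(low: Seq[Int], high: Seq[Int], lowBytes: Int, totalBytes: Int): Seq[Int] =
    (0 until totalBytes).map(i => if (i < lowBytes) low(i) else high(i - lowBytes))

  def main(args: Array[String]): Unit = {
    // 8-byte load split 3 + 5: bytes 0..2 come from the low access, 3..7 from the high one.
    println(merge(Seq(0x11, 0x22, 0x33), Seq(0x44, 0x55, 0x66, 0x77, 0x88), 3, 8))
  }
}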
io.vecWriteBack.bits.alignedType := req.alignedType + io.vecWriteBack.bits.vecFeedback := true.B + io.vecWriteBack.bits.vecdata.get := combinedData + io.vecWriteBack.bits.isvec := req.isvec + io.vecWriteBack.bits.elemIdx := req.elemIdx + io.vecWriteBack.bits.elemIdxInsideVd.get := req.elemIdxInsideVd + io.vecWriteBack.bits.mask := req.mask + io.vecWriteBack.bits.reg_offset.get := 0.U + io.vecWriteBack.bits.usSecondInv := req.usSecondInv + io.vecWriteBack.bits.mBIndex := req.mbIndex + io.vecWriteBack.bits.hit := true.B + io.vecWriteBack.bits.sourceType := RSFeedbackType.lrqFull + io.vecWriteBack.bits.trigger := TriggerAction.None + io.vecWriteBack.bits.flushState := DontCare + io.vecWriteBack.bits.exceptionVec := ExceptionNO.selectByFu(exceptionVec, VlduCfg) + io.vecWriteBack.bits.vaddr := req.fullva + io.vecWriteBack.bits.vaNeedExt := req.vaNeedExt + io.vecWriteBack.bits.gpaddr := req.gpaddr + io.vecWriteBack.bits.isForVSnonLeafPTE := req.isForVSnonLeafPTE + io.vecWriteBack.bits.mmio := DontCare + io.vecWriteBack.bits.vstart := req.uop.vpu.vstart + io.vecWriteBack.bits.vecTriggerMask := req.vecTriggerMask + io.vecWriteBack.bits.nc := false.B + + val flush = req_valid && req.uop.robIdx.needFlush(io.redirect) - when (flush && (bufferState =/= s_idle)) { + when (flush) { bufferState := s_idle req_valid := false.B curPtr := 0.U @@ -596,7 +616,9 @@ class LoadMisalignBuffer(implicit p: Parameters) extends XSModule val overwriteIsHyper = RegEnable(splitLoadResp(curPtr).isHyper, shouldOverwrite) val overwriteIsForVSnonLeafPTE = RegEnable(splitLoadResp(curPtr).isForVSnonLeafPTE, shouldOverwrite) - io.overwriteExpBuf.valid := overwriteExpBuf + //TODO In theory, there is no need to overwrite, but for now, the signal is retained in the code in this way. + // and the signal will be removed after sufficient verification. + io.overwriteExpBuf.valid := false.B io.overwriteExpBuf.vaddr := overwriteVaddr io.overwriteExpBuf.isHyper := overwriteIsHyper io.overwriteExpBuf.gpaddr := overwriteGpaddr diff --git a/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala b/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala index e42dcd5efc..c6012df65c 100644 --- a/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala @@ -191,7 +191,7 @@ class LoadQueue(implicit p: Parameters) extends XSModule val rob = Flipped(new RobLsqIO) val uncache = new UncacheWordIO val exceptionAddr = new ExceptionAddrIO - val flushFrmMaBuf = Input(Bool()) + val loadMisalignFull = Input(Bool()) val lqFull = Output(Bool()) val lqDeq = Output(UInt(log2Up(CommitWidth + 1).W)) val lqCancelCnt = Output(UInt(log2Up(VirtualLoadQueueSize+1).W)) @@ -277,7 +277,8 @@ class LoadQueue(implicit p: Parameters) extends XSModule // mmio non-data error exception exceptionBuffer.io.req(LoadPipelineWidth + VecLoadPipelineWidth) := uncacheBuffer.io.exception exceptionBuffer.io.req(LoadPipelineWidth + VecLoadPipelineWidth).bits.vaNeedExt := true.B - exceptionBuffer.io.flushFrmMaBuf := io.flushFrmMaBuf + + loadQueueReplay.io.loadMisalignFull := io.loadMisalignFull io.exceptionAddr <> exceptionBuffer.io.exceptionAddr @@ -327,6 +328,7 @@ class LoadQueue(implicit p: Parameters) extends XSModule loadQueueReplay.io.l2_hint <> io.l2_hint loadQueueReplay.io.tlb_hint <> io.tlb_hint loadQueueReplay.io.tlbReplayDelayCycleCtrl <> io.tlbReplayDelayCycleCtrl + // TODO: implement it! 
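// --- Illustrative sketch, not part of the patch: the replay cause added below. A load that
// --- was rejected because the misalign buffer was full (C_MF) stays blocked in
// --- LoadQueueReplay until loadMisalignFull deasserts; this is a simplified predicate for
// --- that condition, not the RTL blocking register.
object ReplayBlockModel {
  val C_MF = 10                                   // misalignBuffer full, as in LoadReplayCauses
  def stillBlocked(cause: Set[Int], loadMisalignFull: Boolean): Boolean =
    cause.contains(C_MF) && loadMisalignFull

  def main(args: Array[String]): Unit = {
    println(stillBlocked(Set(C_MF), loadMisalignFull = true))   // true: keep waiting
    println(stillBlocked(Set(C_MF), loadMisalignFull = false))  // false: eligible to replay
  }
}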
loadQueueReplay.io.vecFeedback := io.vecFeedback diff --git a/src/main/scala/xiangshan/mem/lsqueue/LoadQueueReplay.scala b/src/main/scala/xiangshan/mem/lsqueue/LoadQueueReplay.scala index ee557cc05d..f81ab9b0ae 100644 --- a/src/main/scala/xiangshan/mem/lsqueue/LoadQueueReplay.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/LoadQueueReplay.scala @@ -68,8 +68,10 @@ object LoadReplayCauses { val C_RAW = 8 // st-ld violation val C_NK = 9 + // misalignBuffer Full + val C_MF = 10 // total causes - val allCauses = 10 + val allCauses = 11 } class VecReplayInfo(implicit p: Parameters) extends XSBundle with HasVLSUParameters { @@ -203,6 +205,7 @@ class LoadQueueReplay(implicit p: Parameters) extends XSModule val ldWbPtr = Input(new LqPtr) val rarFull = Input(Bool()) val rawFull = Input(Bool()) + val loadMisalignFull = Input(Bool()) val l2_hint = Input(Valid(new L2ToL1Hint())) val tlb_hint = Flipped(new TlbHintIO) val tlbReplayDelayCycleCtrl = Vec(4, Input(UInt(ReSelectLen.W))) @@ -359,6 +362,10 @@ class LoadQueueReplay(implicit p: Parameters) extends XSModule when (cause(i)(LoadReplayCauses.C_RAW)) { blocking(i) := Mux((!io.rawFull || !isAfter(uop(i).sqIdx, io.stAddrReadySqPtr)), false.B, blocking(i)) } + // case C_MF + when (cause(i)(LoadReplayCauses.C_MF)) { + blocking(i) := Mux(!io.loadMisalignFull, false.B, blocking(i)) + } }) // Replay is splitted into 3 stages diff --git a/src/main/scala/xiangshan/mem/lsqueue/StoreMisalignBuffer.scala b/src/main/scala/xiangshan/mem/lsqueue/StoreMisalignBuffer.scala index fd0be8652a..31bb1a0c01 100644 --- a/src/main/scala/xiangshan/mem/lsqueue/StoreMisalignBuffer.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/StoreMisalignBuffer.scala @@ -30,8 +30,9 @@ import xiangshan.frontend.FtqPtr import xiangshan.ExceptionNO._ import xiangshan.cache.wpu.ReplayCarry import xiangshan.backend.rob.RobPtr -import xiangshan.backend.Bundles.{MemExuOutput, DynInst} +import xiangshan.backend.Bundles._ import xiangshan.backend.fu.FuConfig.StaCfg +import xiangshan.backend.fu.FuType.isVStore class StoreMisalignBuffer(implicit p: Parameters) extends XSModule with HasCircularQueuePtrHelper @@ -63,35 +64,45 @@ class StoreMisalignBuffer(implicit p: Parameters) extends XSModule SD -> 0xff.U )) - def selectOldest[T <: LsPipelineBundle](valid: Seq[Bool], bits: Seq[T]): (Seq[Bool], Seq[T]) = { + def selectOldest[T <: LsPipelineBundle](valid: Seq[Bool], bits: Seq[T], index: Seq[UInt]): (Seq[Bool], Seq[T], Seq[UInt]) = { assert(valid.length == bits.length) if (valid.length == 0 || valid.length == 1) { - (valid, bits) + (valid, bits, index) } else if (valid.length == 2) { val res = Seq.fill(2)(Wire(ValidIO(chiselTypeOf(bits(0))))) + val resIndex = Seq.fill(2)(Wire(chiselTypeOf(index(0)))) for (i <- res.indices) { res(i).valid := valid(i) res(i).bits := bits(i) + resIndex(i) := index(i) } val oldest = Mux(valid(0) && valid(1), Mux(isAfter(bits(0).uop.robIdx, bits(1).uop.robIdx) || (isNotBefore(bits(0).uop.robIdx, bits(1).uop.robIdx) && bits(0).uop.uopIdx > bits(1).uop.uopIdx), res(1), res(0)), Mux(valid(0) && !valid(1), res(0), res(1))) - (Seq(oldest.valid), Seq(oldest.bits)) + + val oldestIndex = Mux(valid(0) && valid(1), + Mux(isAfter(bits(0).uop.robIdx, bits(1).uop.robIdx) || + (isNotBefore(bits(0).uop.robIdx, bits(1).uop.robIdx) && bits(0).uop.uopIdx > bits(1).uop.uopIdx), resIndex(1), resIndex(0)), + Mux(valid(0) && !valid(1), resIndex(0), resIndex(1))) + (Seq(oldest.valid), Seq(oldest.bits), Seq(oldestIndex)) } else { - val left = selectOldest(valid.take(valid.length / 2), 
bits.take(bits.length / 2)) - val right = selectOldest(valid.takeRight(valid.length - (valid.length / 2)), bits.takeRight(bits.length - (bits.length / 2))) - selectOldest(left._1 ++ right._1, left._2 ++ right._2) + val left = selectOldest(valid.take(valid.length / 2), bits.take(bits.length / 2), index.take(index.length / 2)) + val right = selectOldest(valid.takeRight(valid.length - (valid.length / 2)), bits.takeRight(bits.length - (bits.length / 2)), index.takeRight(index.length - (index.length / 2))) + selectOldest(left._1 ++ right._1, left._2 ++ right._2, left._3 ++ right._3) } } val io = IO(new Bundle() { val redirect = Flipped(Valid(new Redirect)) - val req = Vec(enqPortNum, Flipped(Valid(new LsPipelineBundle))) + val req = Vec(enqPortNum, Flipped(Decoupled(new LsPipelineBundle))) val rob = Flipped(new RobLsqIO) val splitStoreReq = Decoupled(new LsPipelineBundle) val splitStoreResp = Flipped(Valid(new SqWriteBundle)) val writeBack = Decoupled(new MemExuOutput) + val vecWriteBack = Vec(VecStorePipelineWidth, Decoupled(new VecPipelineFeedbackIO(isVStore = true))) + val storeOutValid = Input(Bool()) + val storeVecOutValid = Input(Bool()) val overwriteExpBuf = Output(new XSBundle { val valid = Bool() val vaddr = UInt(XLEN.W) @@ -100,66 +111,86 @@ class StoreMisalignBuffer(implicit p: Parameters) extends XSModule val isForVSnonLeafPTE = Bool() }) val sqControl = new StoreMaBufToSqControlIO + + val toVecStoreMergeBuffer = Vec(VecStorePipelineWidth, new StoreMaBufToVecStoreMergeBufferIO) + val full = Bool() }) io.rob.mmio := 0.U.asTypeOf(Vec(LoadPipelineWidth, Bool())) io.rob.uop := 0.U.asTypeOf(Vec(LoadPipelineWidth, new DynInst)) + class StoreMisalignBufferEntry(implicit p: Parameters) extends LsPipelineBundle { + val portIndex = UInt(log2Up(enqPortNum).W) + } val req_valid = RegInit(false.B) - val req = Reg(new LsPipelineBundle) + val req = Reg(new StoreMisalignBufferEntry) + + val robMatch = req_valid && io.rob.pendingst && (io.rob.pendingPtr === req.uop.robIdx) + val cross4KBPageBoundary = Wire(Bool()) + val needFlushPipe = RegInit(false.B) // enqueue // s1: val s1_req = VecInit(io.req.map(_.bits)) val s1_valid = VecInit(io.req.map(x => x.valid)) - // s2: delay 1 cycle - val s2_req = RegNext(s1_req) - val s2_valid = (0 until enqPortNum).map(i => - RegNext(s1_valid(i)) && - !s2_req(i).uop.robIdx.needFlush(RegNext(io.redirect)) && - !s2_req(i).uop.robIdx.needFlush(io.redirect) - ) - val s2_miss_aligned = s2_req.map(x => - x.uop.exceptionVec(storeAddrMisaligned) && !x.uop.exceptionVec(breakPoint) && !TriggerAction.isDmode(x.uop.trigger) - ) - - val s2_enqueue = Wire(Vec(enqPortNum, Bool())) - for (w <- 0 until enqPortNum) { - s2_enqueue(w) := s2_valid(w) && s2_miss_aligned(w) - } + val s1_index = (0 until io.req.length).map(_.asUInt) + val reqSel = selectOldest(s1_valid, s1_req, s1_index) + + val reqSelValid = reqSel._1(0) + val reqSelBits = reqSel._2(0) + val reqSelPort = reqSel._3(0) - when (req_valid && req.uop.robIdx.needFlush(io.redirect)) { - req_valid := s2_enqueue.asUInt.orR - } .elsewhen (s2_enqueue.asUInt.orR) { - req_valid := req_valid || true.B + val reqRedirect = reqSelBits.uop.robIdx.needFlush(io.redirect) + + val canEnq = !req_valid && !reqRedirect && reqSelValid + when(canEnq) { + connectSamePort(req, reqSelBits) + req.portIndex := reqSelPort + req_valid := true.B + } + val cross4KBPageEnq = WireInit(false.B) + when (cross4KBPageBoundary && !reqRedirect) { + when(reqSelValid && (isAfter(req.uop.robIdx, reqSelBits.uop.robIdx) || (isNotBefore(req.uop.robIdx, 
reqSelBits.uop.robIdx) && req.uop.uopIdx > reqSelBits.uop.uopIdx))) { + connectSamePort(req, reqSelBits) + req.portIndex := reqSelPort + cross4KBPageEnq := true.B + needFlushPipe := true.B + } .otherwise { + req := req + cross4KBPageEnq := false.B + } } - val reqSel = selectOldest(s2_enqueue, s2_req) + val reqSelCanEnq = UIntToOH(reqSelPort) - when (req_valid) { - req := Mux( - reqSel._1(0) && (isAfter(req.uop.robIdx, reqSel._2(0).uop.robIdx) || (isNotBefore(req.uop.robIdx, reqSel._2(0).uop.robIdx) && req.uop.uopIdx > reqSel._2(0).uop.uopIdx)), - reqSel._2(0), - req) - } .elsewhen (s2_enqueue.asUInt.orR) { - req := reqSel._2(0) + io.req.zipWithIndex.map{ + case (reqPort, index) => reqPort.ready := reqSelCanEnq(index) && (!req_valid || cross4KBPageBoundary && cross4KBPageEnq) } - val robMatch = req_valid && io.rob.pendingst && (io.rob.pendingPtr === req.uop.robIdx) + + io.toVecStoreMergeBuffer.zipWithIndex.map{ + case (toStMB, index) => { + toStMB.flush := req_valid && cross4KBPageBoundary && cross4KBPageEnq && UIntToOH(req.portIndex)(index) + toStMB.mbIndex := req.mbIndex + } + } + io.full := req_valid // buffer control: // - split miss-aligned store into aligned stores // - send split store to sta and get result from sta // - control sq write to sb // - control sq write this store back - val s_idle :: s_split :: s_req :: s_resp :: s_cal :: s_sq_req :: s_wb :: s_wait :: Nil = Enum(8) - val bufferState = RegInit(s_idle) + // s_block When cross page, the high page address needs to be provided to sbuffer, so the + val s_idle :: s_split :: s_req :: s_resp :: s_wb :: s_block :: s_wait :: Nil = Enum(7) + val bufferState = RegInit(s_idle) val splitStoreReqs = RegInit(VecInit(List.fill(maxSplitNum)(0.U.asTypeOf(new LsPipelineBundle)))) val splitStoreResp = RegInit(VecInit(List.fill(maxSplitNum)(0.U.asTypeOf(new SqWriteBundle)))) - val exceptionVec = RegInit(0.U.asTypeOf(ExceptionVec())) - val unSentStores = RegInit(0.U(maxSplitNum.W)) - val unWriteStores = RegInit(0.U(maxSplitNum.W)) + val isCrossPage = RegInit(false.B) + val exceptionVec = RegInit(0.U.asTypeOf(ExceptionVec())) + val unSentStores = RegInit(0.U(maxSplitNum.W)) + val unWriteStores = RegInit(0.U(maxSplitNum.W)) val curPtr = RegInit(0.U(log2Ceil(maxSplitNum).W)) // if there is exception or mmio in split store @@ -169,11 +200,26 @@ class StoreMisalignBuffer(implicit p: Parameters) extends XSModule val hasException = ExceptionNO.selectByFu(io.splitStoreResp.bits.uop.exceptionVec, StaCfg).asUInt.orR && !io.splitStoreResp.bits.need_rep val isMMIO = io.splitStoreResp.bits.mmio && !io.splitStoreResp.bits.need_rep + io.sqControl.toStoreQueue.crossPageWithHit := io.sqControl.toStoreMisalignBuffer.sqPtr === req.uop.sqIdx && isCrossPage + io.sqControl.toStoreQueue.crossPageCanDeq := !isCrossPage || bufferState === s_block + io.sqControl.toStoreQueue.paddr := Cat(splitStoreResp(1).paddr(splitStoreResp(1).paddr.getWidth - 1, 3), 0.U(3.W)) + + io.sqControl.toStoreQueue.withSameUop := io.sqControl.toStoreMisalignBuffer.uop.robIdx === req.uop.robIdx && io.sqControl.toStoreMisalignBuffer.uop.uopIdx === req.uop.uopIdx && req.isvec && robMatch && isCrossPage + switch(bufferState) { is (s_idle) { - when (robMatch) { - bufferState := s_split + when(cross4KBPageBoundary) { + when(robMatch) { + bufferState := s_split + isCrossPage := true.B + } + } .otherwise { + when (req_valid) { + bufferState := s_split + isCrossPage := false.B + } } + } is (s_split) { @@ -195,36 +241,60 @@ class StoreMisalignBuffer(implicit p: Parameters) extends XSModule bufferState 
:= s_wb globalException := hasException globalMMIO := isMMIO - } .elsewhen(io.splitStoreResp.bits.need_rep || (unSentStores & ~clearOh).orR) { + } .elsewhen(io.splitStoreResp.bits.need_rep || (unSentStores & (~clearOh).asUInt).orR) { // need replay or still has unsent requests bufferState := s_req } .otherwise { // got result, goto calculate data and control sq - bufferState := s_cal + bufferState := s_wb } } } - is (s_cal) { - when (io.sqControl.storeInfo.dataReady) { - bufferState := s_sq_req + is (s_wb) { + when (req.isvec) { + when (io.vecWriteBack.map(x => x.fire).reduce( _ || _)) { + bufferState := s_idle + req_valid := false.B + curPtr := 0.U + unSentStores := 0.U + unWriteStores := 0.U + globalException := false.B + globalMMIO := false.B + isCrossPage := false.B + needFlushPipe := false.B + } + } + when (io.writeBack.fire && (!isCrossPage || globalMMIO || globalException)) { + bufferState := s_idle + req_valid := false.B curPtr := 0.U + unSentStores := 0.U + unWriteStores := 0.U + globalException := false.B + globalMMIO := false.B + isCrossPage := false.B + needFlushPipe := false.B + } .elsewhen(io.writeBack.fire && isCrossPage) { + bufferState := s_block + } .otherwise { + bufferState := s_wb } } - is (s_sq_req) { - when (io.sqControl.storeInfo.completeSbTrans) { - when (!((unWriteStores & ~UIntToOH(curPtr)).orR)) { - bufferState := s_wb - } + is (s_block) { + when (io.sqControl.toStoreMisalignBuffer.doDeq) { + bufferState := s_idle + req_valid := false.B + curPtr := 0.U + unSentStores := 0.U + unWriteStores := 0.U + globalException := false.B + globalMMIO := false.B + isCrossPage := false.B } } - is (s_wb) { - when (io.writeBack.fire) { - bufferState := s_wait - } - } is (s_wait) { when (io.rob.scommit =/= 0.U || req.uop.robIdx.needFlush(io.redirect)) { @@ -237,17 +307,28 @@ class StoreMisalignBuffer(implicit p: Parameters) extends XSModule globalException := false.B globalMMIO := false.B } + assert(false.B, s"The State of the storeMisalignBuffer should not reach wait.") } } - val highAddress = LookupTree(req.uop.fuOpType(1, 0), List( + val alignedType = Mux(req.isvec, req.alignedType(1,0), req.uop.fuOpType(1, 0)) + + val highAddress = LookupTree(alignedType, List( SB -> 0.U, SH -> 1.U, SW -> 3.U, SD -> 7.U )) + req.vaddr(4, 0) + + val highPageAddress = LookupTree(alignedType, List( + SB -> 0.U, + SH -> 1.U, + SW -> 3.U, + SD -> 7.U + )) + req.vaddr(12, 0) // to see if (vaddr + opSize - 1) and vaddr are in the same 16 bytes region val cross16BytesBoundary = req_valid && (highAddress(4) =/= req.vaddr(4)) + cross4KBPageBoundary := req_valid && (highPageAddress(12) =/= req.vaddr(12)) val aligned16BytesAddr = (req.vaddr >> 4) << 4// req.vaddr & ~("b1111".U) val aligned16BytesSel = req.vaddr(3, 0) @@ -263,17 +344,7 @@ class StoreMisalignBuffer(implicit p: Parameters) extends XSModule when (bufferState === s_split) { when (!cross16BytesBoundary) { - // change this unaligned store into a 128 bits store - unWriteStores := 1.U - unSentStores := 1.U - curPtr := 0.U - new128Store.vaddr := aligned16BytesAddr - // new128Store.mask := (getMask(req.uop.fuOpType(1, 0)) << aligned16BytesSel).asUInt - new128Store.mask := 0xffff.U - new128Store.uop := req.uop - new128Store.uop.exceptionVec(storeAddrMisaligned) := false.B - new128Store.is128bit := true.B - splitStoreReqs(0) := new128Store + assert(false.B, s"There should be no non-aligned access that does not cross 16Byte boundaries.") } .otherwise { // split this unaligned store into `maxSplitNum` aligned stores unWriteStores := Fill(maxSplitNum, 
1.U(1.W)) @@ -284,7 +355,7 @@ class StoreMisalignBuffer(implicit p: Parameters) extends XSModule highAddrStore.uop := req.uop highAddrStore.uop.exceptionVec(storeAddrMisaligned) := false.B - switch (req.uop.fuOpType(1, 0)) { + switch (alignedType(1, 0)) { is (SB) { assert(false.B, "lb should not trigger miss align") } @@ -445,10 +516,13 @@ class StoreMisalignBuffer(implicit p: Parameters) extends XSModule io.splitStoreReq.valid := req_valid && (bufferState === s_req) io.splitStoreReq.bits := splitStoreReqs(curPtr) + io.splitStoreReq.bits.is128bit := req.isvec // Restore the information of H extension store // bit encoding: | hsv 1 | store 00 | size(2bit) | val reqIsHsv = LSUOpType.isHsv(req.uop.fuOpType) - io.splitStoreReq.bits.uop.fuOpType := Cat(reqIsHsv, 0.U(2.W), splitStoreReqs(curPtr).uop.fuOpType(1, 0)) + io.splitStoreReq.bits.uop.fuOpType := Mux(req.isvec, req.uop.fuOpType, Cat(reqIsHsv, 0.U(2.W), splitStoreReqs(curPtr).uop.fuOpType(1, 0))) + io.splitStoreReq.bits.alignedType := Mux(req.isvec, splitStoreReqs(curPtr).uop.fuOpType(1, 0), req.alignedType) + io.splitStoreReq.bits.isFinalSplit := curPtr(0) when (io.splitStoreResp.valid) { val resp = io.splitStoreResp.bits @@ -464,7 +538,7 @@ class StoreMisalignBuffer(implicit p: Parameters) extends XSModule unSentStores := 0.U StaCfg.exceptionOut.map(no => exceptionVec(no) := exceptionVec(no) || resp.uop.exceptionVec(no)) } .elsewhen (!io.splitStoreResp.bits.need_rep) { - unSentStores := unSentStores & ~UIntToOH(curPtr) + unSentStores := unSentStores & (~UIntToOH(curPtr)).asUInt curPtr := curPtr + 1.U exceptionVec := 0.U.asTypeOf(ExceptionVec()) } @@ -475,7 +549,6 @@ class StoreMisalignBuffer(implicit p: Parameters) extends XSModule val wmask = UInt((VLEN / 8).W) })))) - val unalignedStoreData = io.sqControl.storeInfo.data val wmaskLow = Wire(Vec(VLEN / 8, Bool())) val wmaskHigh = Wire(Vec(VLEN / 8, Bool())) (0 until (VLEN / 8)).map { @@ -493,89 +566,13 @@ class StoreMisalignBuffer(implicit p: Parameters) extends XSModule } } - when (bufferState === s_cal) { - when (!cross16BytesBoundary) { - splitStoreData(0).wdata := LookupTree(aligned16BytesSel, List( - "b0000".U -> unalignedStoreData, - "b0001".U -> Cat(unalignedStoreData, 0.U(( 1 * 8).W)), - "b0010".U -> Cat(unalignedStoreData, 0.U(( 2 * 8).W)), - "b0011".U -> Cat(unalignedStoreData, 0.U(( 3 * 8).W)), - "b0100".U -> Cat(unalignedStoreData, 0.U(( 4 * 8).W)), - "b0101".U -> Cat(unalignedStoreData, 0.U(( 5 * 8).W)), - "b0110".U -> Cat(unalignedStoreData, 0.U(( 6 * 8).W)), - "b0111".U -> Cat(unalignedStoreData, 0.U(( 7 * 8).W)), - "b1000".U -> Cat(unalignedStoreData, 0.U(( 8 * 8).W)), - "b1001".U -> Cat(unalignedStoreData, 0.U(( 9 * 8).W)), - "b1010".U -> Cat(unalignedStoreData, 0.U((10 * 8).W)), - "b1011".U -> Cat(unalignedStoreData, 0.U((11 * 8).W)), - "b1100".U -> Cat(unalignedStoreData, 0.U((12 * 8).W)), - "b1101".U -> Cat(unalignedStoreData, 0.U((13 * 8).W)), - "b1110".U -> Cat(unalignedStoreData, 0.U((14 * 8).W)), - "b1111".U -> Cat(unalignedStoreData, 0.U((15 * 8).W)) - ))(VLEN - 1, 0) - splitStoreData(0).wmask := getMask(req.uop.fuOpType(1, 0)) << aligned16BytesSel - } .otherwise { - // low 16bytes part - val catData = LookupTree(lowResultWidth, List( - BYTE0 -> unalignedStoreData, - BYTE1 -> Cat(unalignedStoreData, 0.U((8 * 15).W)), - BYTE2 -> Cat(unalignedStoreData, 0.U((8 * 14).W)), - BYTE3 -> Cat(unalignedStoreData, 0.U((8 * 13).W)), - BYTE4 -> Cat(unalignedStoreData, 0.U((8 * 12).W)), - BYTE5 -> Cat(unalignedStoreData, 0.U((8 * 11).W)), - BYTE6 -> Cat(unalignedStoreData, 
0.U((8 * 10).W)), - BYTE7 -> Cat(unalignedStoreData, 0.U((8 * 9).W)) - )) - splitStoreData(0).wdata := catData(VLEN - 1, 0) - splitStoreData(0).wmask := VecInit(wmaskLow.reverse).asUInt - // high 16bytes part - val shiftData = LookupTree(lowResultWidth, List( - BYTE0 -> unalignedStoreData(VLEN - 1, 0), - BYTE1 -> unalignedStoreData(VLEN - 1, 8), - BYTE2 -> unalignedStoreData(VLEN - 1, 16), - BYTE3 -> unalignedStoreData(VLEN - 1, 24), - BYTE4 -> unalignedStoreData(VLEN - 1, 32), - BYTE5 -> unalignedStoreData(VLEN - 1, 40), - BYTE6 -> unalignedStoreData(VLEN - 1, 48), - BYTE7 -> unalignedStoreData(VLEN - 1, 56) - )) - splitStoreData(1).wdata := LookupTree(highResultWidth, List( - BYTE0 -> ZeroExt(shiftData, VLEN), - BYTE1 -> ZeroExt(shiftData(7, 0), VLEN), - BYTE2 -> ZeroExt(shiftData(15, 0), VLEN), - BYTE3 -> ZeroExt(shiftData(23, 0), VLEN), - BYTE4 -> ZeroExt(shiftData(31, 0), VLEN), - BYTE5 -> ZeroExt(shiftData(39, 0), VLEN), - BYTE6 -> ZeroExt(shiftData(47, 0), VLEN), - BYTE7 -> ZeroExt(shiftData(55, 0), VLEN) - )) - splitStoreData(1).wmask := wmaskHigh.asUInt - } - } - - io.sqControl.control.hasException := req_valid && globalException - - io.sqControl.control.writeSb := bufferState === s_sq_req - io.sqControl.control.wdata := splitStoreData(curPtr).wdata - io.sqControl.control.wmask := splitStoreData(curPtr).wmask - // the paddr and vaddr is not corresponding to the exact addr of - io.sqControl.control.paddr := splitStoreResp(curPtr).paddr - io.sqControl.control.vaddr := splitStoreResp(curPtr).vaddr - io.sqControl.control.last := !((unWriteStores & ~UIntToOH(curPtr)).orR) - - when (bufferState === s_sq_req) { - when (io.sqControl.storeInfo.completeSbTrans) { - unWriteStores := unWriteStores & ~UIntToOH(curPtr) - curPtr := curPtr + 1.U - } - } - io.writeBack.valid := req_valid && (bufferState === s_wb) && io.sqControl.storeInfo.dataReady + io.writeBack.valid := req_valid && (bufferState === s_wb) && !io.storeOutValid && !req.isvec io.writeBack.bits.uop := req.uop io.writeBack.bits.uop.exceptionVec := DontCare StaCfg.exceptionOut.map(no => io.writeBack.bits.uop.exceptionVec(no) := (globalMMIO || globalException) && exceptionVec(no)) - io.writeBack.bits.uop.flushPipe := Mux(globalMMIO || globalException, false.B, true.B) + io.writeBack.bits.uop.flushPipe := needFlushPipe io.writeBack.bits.uop.replayInst := false.B - io.writeBack.bits.data := unalignedStoreData + io.writeBack.bits.data := DontCare io.writeBack.bits.isFromLoadUnit := DontCare io.writeBack.bits.debug.isMMIO := globalMMIO // FIXME lyq: temporarily set to false @@ -584,18 +581,45 @@ class StoreMisalignBuffer(implicit p: Parameters) extends XSModule io.writeBack.bits.debug.paddr := req.paddr io.writeBack.bits.debug.vaddr := req.vaddr - io.sqControl.control.removeSq := req_valid && (bufferState === s_wait) && !(globalMMIO || globalException) && (io.rob.scommit =/= 0.U) + io.vecWriteBack.zipWithIndex.map{ + case (wb, index) => { + wb.valid := req_valid && (bufferState === s_wb) && req.isvec && !io.storeVecOutValid && UIntToOH(req.portIndex)(index) + + wb.bits.mBIndex := req.mbIndex + wb.bits.hit := true.B + wb.bits.isvec := true.B + wb.bits.sourceType := RSFeedbackType.tlbMiss + wb.bits.flushState := DontCare + wb.bits.trigger := TriggerAction.None + wb.bits.mmio := globalMMIO + wb.bits.exceptionVec := ExceptionNO.selectByFu(exceptionVec, VstuCfg) + wb.bits.usSecondInv := req.usSecondInv + wb.bits.vecFeedback := true.B + wb.bits.elemIdx := req.elemIdx + wb.bits.alignedType := req.alignedType + wb.bits.mask := req.mask + 
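// --- Illustrative sketch, not part of the patch: the boundary checks defined earlier in
// --- StoreMisalignBuffer. `crosses` with a 16-byte granule mirrors cross16BytesBoundary,
// --- and with a 4 KiB granule mirrors cross4KBPageBoundary (which gates the s_block /
// --- sqControl handshake). A plain byte-address model, not the RTL bit-slice comparison.
object CrossBoundaryModel {
  def crosses(vaddr: BigInt, sizeBytes: Int, granuleBytes: Int): Boolean =
    (vaddr / granuleBytes) != ((vaddr + sizeBytes - 1) / granuleBytes)

  def main(args: Array[String]): Unit = {
    val a = BigInt("80000ffd", 16)
    println(crosses(a, 8, 16))    // true: an 8-byte store at ...ffd crosses a 16-byte boundary
    println(crosses(a, 8, 4096))  // true: it also crosses into the next 4 KiB page at ...1000
  }
}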
wb.bits.vaddr := req.vaddr + wb.bits.vaNeedExt := req.vaNeedExt + wb.bits.gpaddr := req.gpaddr + wb.bits.isForVSnonLeafPTE := req.isForVSnonLeafPTE + wb.bits.vstart := req.uop.vpu.vstart + wb.bits.vecTriggerMask := 0.U + wb.bits.nc := false.B + } + } val flush = req_valid && req.uop.robIdx.needFlush(io.redirect) - when (flush && (bufferState =/= s_idle)) { + when (flush) { bufferState := s_idle - req_valid := false.B + req_valid := Mux(cross4KBPageEnq && cross4KBPageBoundary && !reqRedirect, req_valid, false.B) curPtr := 0.U unSentStores := 0.U unWriteStores := 0.U globalException := false.B globalMMIO := false.B + isCrossPage := false.B + needFlushPipe := false.B } // NOTE: spectial case (unaligned store cross page, page fault happens in next page) @@ -607,7 +631,9 @@ class StoreMisalignBuffer(implicit p: Parameters) extends XSModule val overwriteGpaddr = RegEnable(splitStoreResp(curPtr).gpaddr, shouldOverwrite) val overwriteIsForVSnonLeafPTE = RegEnable(splitStoreResp(curPtr).isForVSnonLeafPTE, shouldOverwrite) - io.overwriteExpBuf.valid := overwriteExpBuf + //TODO In theory, there is no need to overwrite, but for now, the signal is retained in the code in this way. + // and the signal will be removed after sufficient verification. + io.overwriteExpBuf.valid := false.B io.overwriteExpBuf.vaddr := overwriteVaddr io.overwriteExpBuf.isHyper := overwriteIsHyper io.overwriteExpBuf.gpaddr := overwriteGpaddr diff --git a/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala b/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala index 9e9a76d8b2..2de1b718d0 100644 --- a/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala @@ -67,6 +67,7 @@ class DataBufferEntry (implicit p: Parameters) extends DCacheBundle { val sqPtr = new SqPtr val prefetch = Bool() val vecValid = Bool() + val sqNeedDeq = Bool() } class StoreExceptionBuffer(implicit p: Parameters) extends XSModule with HasCircularQueuePtrHelper { @@ -79,7 +80,6 @@ class StoreExceptionBuffer(implicit p: Parameters) extends XSModule with HasCirc val io = IO(new Bundle() { val redirect = Flipped(ValidIO(new Redirect)) val storeAddrIn = Vec(enqPortNum, Flipped(ValidIO(new LsPipelineBundle()))) - val flushFrmMaBuf = Input(Bool()) val exceptionAddr = new ExceptionAddrIO }) @@ -108,7 +108,7 @@ class StoreExceptionBuffer(implicit p: Parameters) extends XSModule with HasCirc when (req_valid && req.uop.robIdx.needFlush(io.redirect)) { req_valid := s2_enqueue.asUInt.orR }.elsewhen (s2_enqueue.asUInt.orR) { - req_valid := req_valid || true.B + req_valid := true.B } def selectOldest[T <: LsPipelineBundle](valid: Seq[Bool], bits: Seq[T]): (Seq[Bool], Seq[T]) = { @@ -152,9 +152,6 @@ class StoreExceptionBuffer(implicit p: Parameters) extends XSModule with HasCirc io.exceptionAddr.vl := req.uop.vpu.vl io.exceptionAddr.isForVSnonLeafPTE := req.isForVSnonLeafPTE - when(req_valid && io.flushFrmMaBuf) { - req_valid := false.B - } } // Store Queue @@ -260,6 +257,7 @@ class StoreQueue(implicit p: Parameters) extends XSModule val allvalid = VecInit((0 until StoreQueueSize).map(i => addrvalid(i) && datavalid(i))) val committed = RegInit(VecInit(List.fill(StoreQueueSize)(false.B))) // inst has been committed by rob val unaligned = RegInit(VecInit(List.fill(StoreQueueSize)(false.B))) // unaligned store + val cross16Byte = RegInit(VecInit(List.fill(StoreQueueSize)(false.B))) // unaligned cross 16Byte boundary val pending = RegInit(VecInit(List.fill(StoreQueueSize)(false.B))) // mmio pending: inst is an mmio 
inst, it will not be executed until it reachs the end of rob val nc = RegInit(VecInit(List.fill(StoreQueueSize)(false.B))) // nc: inst is a nc inst val mmio = RegInit(VecInit(List.fill(StoreQueueSize)(false.B))) // mmio: inst is an mmio inst @@ -295,12 +293,6 @@ class StoreQueue(implicit p: Parameters) extends XSModule val commitCount = WireInit(0.U(log2Ceil(CommitWidth + 1).W)) val scommit = GatedRegNext(io.rob.scommit) - - // RegNext misalign control for better timing - val doMisalignSt = GatedValidRegNext((rdataPtrExt(0).value === deqPtr) && (cmtPtr === deqPtr) && allocated(deqPtr) && datavalid(deqPtr) && unaligned(deqPtr)) - val finishMisalignSt = GatedValidRegNext(doMisalignSt && io.maControl.control.removeSq && !io.maControl.control.hasException) - val misalignBlock = doMisalignSt && !finishMisalignSt - val mmioReq = Wire(chiselTypeOf(io.uncache.req)) val ncReq = Wire(chiselTypeOf(io.uncache.req)) val ncResp = Wire(chiselTypeOf(io.uncache.resp)) @@ -311,11 +303,6 @@ class StoreQueue(implicit p: Parameters) extends XSModule // TODO lyq: to eliminate coupling by passing signals through ubuffer val ncDeqTrigger = Mux(io.uncacheOutstanding, RegNext(RegNext(ncDoReq)), ncDoResp) val ncPtr = Mux(io.uncacheOutstanding, RegNext(RegNext(io.uncache.req.bits.id)), io.uncache.resp.bits.id) - - // store miss align info - io.maControl.storeInfo.data := dataModule.io.rdata(0).data - io.maControl.storeInfo.dataReady := doMisalignSt - io.maControl.storeInfo.completeSbTrans := doMisalignSt && dataBuffer.io.enq(0).fire // store can be committed by ROB io.rob.mmio := DontCare @@ -326,7 +313,7 @@ class StoreQueue(implicit p: Parameters) extends XSModule // rdataPtrExtNext and rdataPtrExtNext+1 entry will be read from dataModule val rdataPtrExtNext = Wire(Vec(EnsbufferWidth, new SqPtr)) rdataPtrExtNext := rdataPtrExt.map(i => i + - PopCount(dataBuffer.io.enq.map(_.fire)) + + PopCount(dataBuffer.io.enq.map(x=> x.fire && x.bits.sqNeedDeq)) + PopCount(ncReadNextTrigger || io.mmioStout.fire || io.vecmmioStout.fire) ) @@ -339,14 +326,16 @@ class StoreQueue(implicit p: Parameters) extends XSModule // // Modify deqPtrExtNext and io.sqDeq with care! 
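Note on the pointer-update hunks just below: every advance of rdataPtr/deqPtr (and io.sqDeq) is now gated by the new sqNeedDeq flag, because a store that crosses a 16-byte boundary is emitted as two dataBuffer/sbuffer writes while occupying a single StoreQueue entry, and only the first write should retire that entry. A minimal plain-Scala sketch of that bookkeeping (names here are illustrative, not the Chisel signals):

final case class SbWrite(fire: Boolean, sqNeedDeq: Boolean)

object SqNeedDeqSketch {
  // deqPtr advances only by writes that both fired and are flagged sqNeedDeq,
  // mirroring PopCount(io.sbuffer.map(x => x.fire && x.bits.sqNeedDeq)).
  def deqPtrNext(deqPtr: Int, writes: Seq[SbWrite]): Int =
    deqPtr + writes.count(w => w.fire && w.sqNeedDeq)

  def main(args: Array[String]): Unit = {
    // A store crossing a 16-byte boundary: low half retires the SQ entry,
    // high half carries data only (sqNeedDeq = false).
    val crossStore = Seq(SbWrite(fire = true, sqNeedDeq = true),
                         SbWrite(fire = true, sqNeedDeq = false))
    assert(deqPtrNext(0, crossStore) == 1) // two writes issued, one SQ entry dequeued
  }
}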
val deqPtrExtNext = Wire(Vec(EnsbufferWidth, new SqPtr)) + // Only sqNeedDeq can move the ptr deqPtrExtNext := deqPtrExt.map(i => i + - RegNext(PopCount(VecInit(io.sbuffer.map(_.fire)))) + + RegNext(PopCount(VecInit(io.sbuffer.map(x=> x.fire && x.bits.sqNeedDeq)))) + PopCount(ncDeqTrigger || io.mmioStout.fire || io.vecmmioStout.fire) ) + val deqPtrExtNextCount = RegNext(PopCount(VecInit(io.sbuffer.map(_.bits.sqNeedDeq)))) io.sqDeq := RegNext( - RegNext(PopCount(VecInit(io.sbuffer.map(_.fire && !misalignBlock)))) + - PopCount(ncDeqTrigger || io.mmioStout.fire || io.vecmmioStout.fire || finishMisalignSt) + RegNext(PopCount(VecInit(io.sbuffer.map(x=> x.fire && x.bits.sqNeedDeq)))) + + PopCount(ncDeqTrigger || io.mmioStout.fire || io.vecmmioStout.fire) ) assert(!RegNext(RegNext(io.sbuffer(0).fire) && (io.mmioStout.fire || io.vecmmioStout.fire))) @@ -388,6 +377,7 @@ class StoreQueue(implicit p: Parameters) extends XSModule datavalid((index + j.U).value) := false.B addrvalid((index + j.U).value) := false.B unaligned((index + j.U).value) := false.B + cross16Byte((index + j.U).value) := false.B committed((index + j.U).value) := false.B pending((index + j.U).value) := false.B prefetch((index + j.U).value) := false.B @@ -489,12 +479,16 @@ class StoreQueue(implicit p: Parameters) extends XSModule exceptionBuffer.io.storeAddrIn(StorePipelineWidth + i).valid := false.B exceptionBuffer.io.storeAddrIn(StorePipelineWidth + i).bits := 0.U.asTypeOf(new LsPipelineBundle) - when (io.storeAddrIn(i).fire) { + when (io.storeAddrIn(i).fire && io.storeAddrIn(i).bits.updateAddrValid) { val addr_valid = !io.storeAddrIn(i).bits.miss addrvalid(stWbIndex) := addr_valid //!io.storeAddrIn(i).bits.mmio nc(stWbIndex) := io.storeAddrIn(i).bits.nc + + } + when (io.storeAddrIn(i).fire && io.storeAddrIn(i).bits.isFinalSplit) { // pending(stWbIndex) := io.storeAddrIn(i).bits.mmio - unaligned(stWbIndex) := io.storeAddrIn(i).bits.uop.exceptionVec(storeAddrMisaligned) && !io.storeAddrIn(i).bits.isvec + unaligned(stWbIndex) := io.storeAddrIn(i).bits.isMisalign + cross16Byte(stWbIndex) := io.storeAddrIn(i).bits.isMisalign && !io.storeAddrIn(i).bits.misalignWith16Byte paddrModule.io.waddr(i) := stWbIndex paddrModule.io.wdata(i) := io.storeAddrIn(i).bits.paddr @@ -531,7 +525,7 @@ class StoreQueue(implicit p: Parameters) extends XSModule // re-replinish mmio, for pma/pmp will get mmio one cycle later val storeAddrInFireReg = RegNext(io.storeAddrIn(i).fire && !io.storeAddrIn(i).bits.miss) //val stWbIndexReg = RegNext(stWbIndex) - val stWbIndexReg = RegEnable(stWbIndex, io.storeAddrIn(i).fire) + val stWbIndexReg = RegEnable(stWbIndex, io.storeAddrIn(i).fire && io.storeAddrIn(i).bits.updateAddrValid) when (storeAddrInFireReg) { pending(stWbIndexReg) := io.storeAddrInRe(i).mmio mmio(stWbIndexReg) := io.storeAddrInRe(i).mmio @@ -548,7 +542,7 @@ class StoreQueue(implicit p: Parameters) extends XSModule // enter exceptionbuffer again when (storeAddrInFireReg) { exceptionBuffer.io.storeAddrIn(StorePipelineWidth + i).valid := io.storeAddrInRe(i).af && !io.storeAddrInRe(i).isvec - exceptionBuffer.io.storeAddrIn(StorePipelineWidth + i).bits := RegEnable(io.storeAddrIn(i).bits, io.storeAddrIn(i).fire && !io.storeAddrIn(i).bits.miss) + exceptionBuffer.io.storeAddrIn(StorePipelineWidth + i).bits := io.storeAddrInRe(i) exceptionBuffer.io.storeAddrIn(StorePipelineWidth + i).bits.uop.exceptionVec(storeAccessFault) := io.storeAddrInRe(i).af } @@ -687,6 +681,12 @@ class StoreQueue(implicit p: Parameters) extends XSModule val dataInvalidMask = 
dataInvalidMask1 | dataInvalidMask2 io.forward(i).dataInvalidFast := dataInvalidMask.orR + //TODO If the previous store appears out of alignment, then simply FF, this is a very unreasonable way to do it. + //TODO But for the time being, this is the way to ensure correctness. Such a suitable opportunity to support unaligned forward. + val unalignedMask1 = unaligned.asUInt & forwardMask1.asUInt & allocated.asUInt + val unalignedMask2 = unaligned.asUInt & forwardMask2.asUInt & allocated.asUInt + val forwardPreWithUnaligned = (unalignedMask1 | unalignedMask2).asUInt.orR + // make chisel happy val dataInvalidMask1Reg = Wire(UInt(StoreQueueSize.W)) dataInvalidMask1Reg := RegNext(dataInvalidMask1) @@ -708,7 +708,7 @@ class StoreQueue(implicit p: Parameters) extends XSModule val addrInvalidMaskReg = addrInvalidMask1Reg | addrInvalidMask2Reg // load_s2 - io.forward(i).dataInvalid := RegNext(io.forward(i).dataInvalidFast) + io.forward(i).dataInvalid := RegNext(io.forward(i).dataInvalidFast) || RegNext(forwardPreWithUnaligned) // check if vaddr forward mismatched io.forward(i).matchInvalid := vaddrMatchFailed @@ -895,7 +895,7 @@ class StoreQueue(implicit p: Parameters) extends XSModule ncReq.bits.atomic := atomic(GatedRegNext(rdataPtrExtNext(0)).value) ncReq.bits.nc := true.B ncReq.bits.id := rptr0 - + ncResp.ready := io.uncache.resp.ready ncResp.valid := io.uncache.resp.fire && io.uncache.resp.bits.nc ncResp.bits <> io.uncache.resp.bits @@ -903,7 +903,7 @@ class StoreQueue(implicit p: Parameters) extends XSModule allocated(ncPtr) := false.B XSDebug("nc fire: ptr %d\n", ncPtr) } - + mmioReq.ready := io.uncache.req.ready ncReq.ready := io.uncache.req.ready && !mmioReq.valid io.uncache.req.valid := mmioReq.valid || ncReq.valid @@ -1018,7 +1018,6 @@ class StoreQueue(implicit p: Parameters) extends XSModule // don't mark misalign store as committed when ( allocated(cmtPtrExt(i).value) && - !unaligned(cmtPtrExt(i).value) && isNotAfter(uop(cmtPtrExt(i).value).robIdx, GatedRegNext(io.rob.pendingPtr)) && !needCancel(cmtPtrExt(i).value) && (!waitStoreS2(cmtPtrExt(i).value) || isVec(cmtPtrExt(i).value))) { @@ -1044,15 +1043,28 @@ class StoreQueue(implicit p: Parameters) extends XSModule /** * committed stores will not be cancelled and can be sent to lower level. - * + * * 1. Store NC: Read data to uncache * implement as above - * + * * 2. Store Cache: Read data from data module * remove retired insts from sq, add retired store to sbuffer. * as store queue grows larger and larger, time needed to read data from data * module keeps growing higher. Now we give data read a whole cycle. */ + + //TODO An unaligned command can only be sent out if the databuffer can enter more than two. + //TODO For now, hardcode the number of ENQs for the databuffer. + val canDeqMisaligned = dataBuffer.io.enq(0).ready && dataBuffer.io.enq(1).ready + val firstWithMisalign = unaligned(rdataPtrExt(0).value) + val firstWithCross16Byte = cross16Byte(rdataPtrExt(0).value) + + val isCross4KPage = io.maControl.toStoreQueue.crossPageWithHit + val isCross4KPageCanDeq = io.maControl.toStoreQueue.crossPageCanDeq + // When encountering a cross page store, a request needs to be sent to storeMisalignBuffer for the high page table's paddr. 
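The comment above is the key point of the maControl handshake wired up in the next hunk: when the two halves of a misaligned store fall in different 4 KiB pages, the high half's physical address cannot be derived from the low half, since adjacent virtual pages may map to unrelated frames, so it is taken from the StoreMisalignBuffer, which performed the second translation. A small plain-Scala illustration; the page mapping and addresses are invented for the example:

object CrossPageSketch {
  // Hypothetical page table: two adjacent virtual pages mapped to
  // non-adjacent physical frames, which the architecture allows.
  val pageMap = Map(0x7EL -> 0x12345L, 0x7FL -> 0x00042L) // vpn -> ppn, 4 KiB pages

  def translate(vaddr: Long): Long = (pageMap(vaddr >> 12) << 12) | (vaddr & 0xfff)

  def main(args: Array[String]): Unit = {
    val vaddrLow  = 0x7EFFCL     // 8-byte store at ...FFC crosses the page boundary
    val vaddrHigh = vaddrLow + 8 // fine to compute in the virtual address space
    val paddrLow  = translate(vaddrLow & ~7L)
    val paddrHigh = translate(vaddrHigh & ~7L)
    // paddrHigh != paddrLow + 8, so the high half needs its own translation,
    // which is what io.maControl.toStoreQueue.paddr supplies below.
    assert(paddrHigh != paddrLow + 8)
  }
}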
+ io.maControl.toStoreMisalignBuffer.sqPtr := rdataPtrExt(0) + io.maControl.toStoreMisalignBuffer.doDeq := isCross4KPage && isCross4KPageCanDeq && dataBuffer.io.enq(0).fire + io.maControl.toStoreMisalignBuffer.uop := uop(rdataPtrExt(0).value) for (i <- 0 until EnsbufferWidth) { val ptr = rdataPtrExt(i).value val mmioStall = if(i == 0) mmio(rdataPtrExt(0).value) else (mmio(rdataPtrExt(i).value) || mmio(rdataPtrExt(i-1).value)) @@ -1063,31 +1075,137 @@ class StoreQueue(implicit p: Parameters) extends XSModule val vecNotAllMask = dataModule.io.rdata(i).mask.orR // Vector instructions that prevent triggered exceptions from being written to the 'databuffer'. val vecHasExceptionFlagValid = vecExceptionFlag.valid && isVec(ptr) && vecExceptionFlag.bits.robIdx === uop(ptr).robIdx - if (i == 0) { - // use dataBuffer write port 0 to writeback missaligned store out - dataBuffer.io.enq(i).valid := Mux( - doMisalignSt, - io.maControl.control.writeSb, - allocated(ptr) && committed(ptr) && ((!isVec(ptr) && (allvalid(ptr) || hasException(ptr))) || vecMbCommit(ptr)) && !mmioStall && !ncStall - ) - } else { - dataBuffer.io.enq(i).valid := Mux( - doMisalignSt, - false.B, - allocated(ptr) && committed(ptr) && ((!isVec(ptr) && (allvalid(ptr) || hasException(ptr))) || vecMbCommit(ptr)) && !mmioStall && !ncStall - ) + + // Only the first interface can write unaligned directives. + // Simplified design, even if the two ports have exceptions, but still only one unaligned dequeue. + val assert_flag = WireInit(false.B) + when(firstWithMisalign && firstWithCross16Byte) { + dataBuffer.io.enq(0).valid := canDeqMisaligned && allocated(rdataPtrExt(0).value) && committed(rdataPtrExt(0).value) && + ((!isVec(rdataPtrExt(0).value) && allvalid(rdataPtrExt(0).value) || vecMbCommit(rdataPtrExt(0).value)) && + (!isCross4KPage || isCross4KPageCanDeq) || hasException(rdataPtrExt(0).value)) && !ncStall + + dataBuffer.io.enq(1).valid := canDeqMisaligned && allocated(rdataPtrExt(0).value) && committed(rdataPtrExt(0).value) && + (!isVec(rdataPtrExt(0).value) && allvalid(rdataPtrExt(0).value) || vecMbCommit(rdataPtrExt(0).value)) && + (!isCross4KPage || isCross4KPageCanDeq) && !hasException(rdataPtrExt(0).value) && !ncStall + assert_flag := dataBuffer.io.enq(1).valid + }.otherwise { + if (i == 0) { + dataBuffer.io.enq(i).valid := ( + allocated(ptr) && committed(ptr) + && ((!isVec(ptr) && (allvalid(ptr) || hasException(ptr))) || vecMbCommit(ptr)) + && !mmioStall && !ncStall + && (!unaligned(ptr) || !cross16Byte(ptr) && (allvalid(ptr) || hasException(ptr))) + ) + } + else { + dataBuffer.io.enq(i).valid := ( + allocated(ptr) && committed(ptr) + && ((!isVec(ptr) && (allvalid(ptr) || hasException(ptr))) || vecMbCommit(ptr)) + && !mmioStall && !ncStall + && (!unaligned(ptr) || !cross16Byte(ptr) && (allvalid(ptr) || hasException(ptr))) + ) + } } + + val misalignAddrLow = vaddrModule.io.rdata(0)(2, 0) + val cross16ByteAddrLow4bit = vaddrModule.io.rdata(0)(3, 0) + val addrLow4bit = vaddrModule.io.rdata(i)(3, 0) + + // For unaligned, we need to generate a base-aligned mask in storeunit and then do a shift split in StoreQueue. 
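The added code just below implements the split described in the comment above: the queue keeps the base-aligned byte mask from the store unit, shifts mask and data left by vaddr(3,0), and then routes the low 128 data bits / 16 mask bits to the first sbuffer write and the upper ones to the second. A plain-Scala model of that arithmetic, for reference while reviewing the hunk (the function and names are illustrative only):

object ShiftSplitSketch {
  // data: the store data, mask: its base-aligned byte enables,
  // offset: vaddr(3,0), i.e. the byte offset inside a 16-byte region.
  def split(data: BigInt, mask: Int, offset: Int): ((BigInt, Int), (BigInt, Int)) = {
    val wideData = data << (offset * 8)   // Cross16ByteData
    val wideMask = mask << offset         // Cross16ByteMask
    val low  = (wideData & ((BigInt(1) << 128) - 1), wideMask & 0xffff)
    val high = (wideData >> 128,           (wideMask >> 16) & 0xffff)
    (low, high)
  }

  def main(args: Array[String]): Unit = {
    // 8-byte store at offset 0xC: four bytes stay in the low 16-byte half,
    // four bytes spill into the high half.
    val ((_, maskLow), (_, maskHigh)) = split(BigInt("1122334455667788", 16), 0xff, 0xc)
    assert(maskLow  == 0xf000) // bytes 12..15 of the low write
    assert(maskHigh == 0x000f) // bytes 0..3 of the high write
  }
}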
+ val Cross16ByteMask = Wire(UInt(32.W)) + val Cross16ByteData = Wire(UInt(256.W)) + Cross16ByteMask := dataModule.io.rdata(0).mask << cross16ByteAddrLow4bit + Cross16ByteData := dataModule.io.rdata(0).data << (cross16ByteAddrLow4bit << 3) + + val paddrLow = Cat(paddrModule.io.rdata(0)(paddrModule.io.rdata(0).getWidth - 1, 3), 0.U(3.W)) + val paddrHigh = Cat(paddrModule.io.rdata(0)(paddrModule.io.rdata(0).getWidth - 1, 3), 0.U(3.W)) + 8.U + + val vaddrLow = Cat(vaddrModule.io.rdata(0)(vaddrModule.io.rdata(0).getWidth - 1, 3), 0.U(3.W)) + val vaddrHigh = Cat(vaddrModule.io.rdata(0)(vaddrModule.io.rdata(0).getWidth - 1, 3), 0.U(3.W)) + 8.U + + val maskLow = Cross16ByteMask(15, 0) + val maskHigh = Cross16ByteMask(31, 16) + + val dataLow = Cross16ByteData(127, 0) + val dataHigh = Cross16ByteData(255, 128) + + when(canDeqMisaligned && firstWithMisalign && firstWithCross16Byte) { + when(isCross4KPage && isCross4KPageCanDeq) { + if (i == 0) { + dataBuffer.io.enq(i).bits.addr := paddrLow + dataBuffer.io.enq(i).bits.vaddr := vaddrLow + dataBuffer.io.enq(i).bits.data := dataLow + dataBuffer.io.enq(i).bits.mask := maskLow + dataBuffer.io.enq(i).bits.wline := false.B + dataBuffer.io.enq(i).bits.sqPtr := rdataPtrExt(0) + dataBuffer.io.enq(i).bits.prefetch := false.B + dataBuffer.io.enq(i).bits.sqNeedDeq := true.B + dataBuffer.io.enq(i).bits.vecValid := (!isVec(ptr) || (vecDataValid(ptr) && vecNotAllMask)) && !exceptionValid && !vecHasExceptionFlagValid + } + else { + dataBuffer.io.enq(i).bits.addr := io.maControl.toStoreQueue.paddr + dataBuffer.io.enq(i).bits.vaddr := vaddrHigh + dataBuffer.io.enq(i).bits.data := dataHigh + dataBuffer.io.enq(i).bits.mask := maskHigh + dataBuffer.io.enq(i).bits.wline := false.B + dataBuffer.io.enq(i).bits.sqPtr := rdataPtrExt(0) + dataBuffer.io.enq(i).bits.prefetch := false.B + dataBuffer.io.enq(i).bits.sqNeedDeq := false.B + dataBuffer.io.enq(i).bits.vecValid := (!isVec(ptr) || (vecDataValid(ptr) && vecNotAllMask)) && !exceptionValid && !vecHasExceptionFlagValid + } + } .otherwise { + if (i == 0) { + dataBuffer.io.enq(i).bits.addr := paddrLow + dataBuffer.io.enq(i).bits.vaddr := vaddrLow + dataBuffer.io.enq(i).bits.data := dataLow + dataBuffer.io.enq(i).bits.mask := maskLow + dataBuffer.io.enq(i).bits.wline := false.B + dataBuffer.io.enq(i).bits.sqPtr := rdataPtrExt(0) + dataBuffer.io.enq(i).bits.prefetch := false.B + dataBuffer.io.enq(i).bits.sqNeedDeq := true.B + dataBuffer.io.enq(i).bits.vecValid := (!isVec(ptr) || (vecDataValid(ptr) && vecNotAllMask)) && !exceptionValid && !vecHasExceptionFlagValid + } + else { + dataBuffer.io.enq(i).bits.addr := paddrHigh + dataBuffer.io.enq(i).bits.vaddr := vaddrHigh + dataBuffer.io.enq(i).bits.data := dataHigh + dataBuffer.io.enq(i).bits.mask := maskHigh + dataBuffer.io.enq(i).bits.wline := false.B + dataBuffer.io.enq(i).bits.sqPtr := rdataPtrExt(0) + dataBuffer.io.enq(i).bits.prefetch := false.B + dataBuffer.io.enq(i).bits.sqNeedDeq := false.B + dataBuffer.io.enq(i).bits.vecValid := (!isVec(ptr) || (vecDataValid(ptr) && vecNotAllMask)) && !exceptionValid && !vecHasExceptionFlagValid + } + } + + + }.elsewhen(!cross16Byte(ptr) && unaligned(ptr)) { + dataBuffer.io.enq(i).bits.addr := Cat(paddrModule.io.rdata(i)(PAddrBits - 1, 4), 0.U(4.W)) + dataBuffer.io.enq(i).bits.vaddr := Cat(vaddrModule.io.rdata(i)(VAddrBits - 1, 4), 0.U(4.W)) + dataBuffer.io.enq(i).bits.data := dataModule.io.rdata(i).data << (addrLow4bit << 3) + dataBuffer.io.enq(i).bits.mask := dataModule.io.rdata(i).mask + dataBuffer.io.enq(i).bits.wline := 
paddrModule.io.rlineflag(i) + dataBuffer.io.enq(i).bits.sqPtr := rdataPtrExt(i) + dataBuffer.io.enq(i).bits.prefetch := prefetch(ptr) + dataBuffer.io.enq(i).bits.sqNeedDeq := true.B + // when scalar has exception, will also not write into sbuffer + dataBuffer.io.enq(i).bits.vecValid := (!isVec(ptr) || (vecDataValid(ptr) && vecNotAllMask)) && !exceptionValid && !vecHasExceptionFlagValid + }.otherwise { + dataBuffer.io.enq(i).bits.addr := paddrModule.io.rdata(i) + dataBuffer.io.enq(i).bits.vaddr := vaddrModule.io.rdata(i) + dataBuffer.io.enq(i).bits.data := dataModule.io.rdata(i).data + dataBuffer.io.enq(i).bits.mask := dataModule.io.rdata(i).mask + dataBuffer.io.enq(i).bits.wline := paddrModule.io.rlineflag(i) + dataBuffer.io.enq(i).bits.sqPtr := rdataPtrExt(i) + dataBuffer.io.enq(i).bits.prefetch := prefetch(ptr) + dataBuffer.io.enq(i).bits.sqNeedDeq := true.B + // when scalar has exception, will also not write into sbuffer + dataBuffer.io.enq(i).bits.vecValid := (!isVec(ptr) || (vecDataValid(ptr) && vecNotAllMask)) && !exceptionValid && !vecHasExceptionFlagValid + + } + // Note that store data/addr should both be valid after store's commit - assert(!dataBuffer.io.enq(i).valid || allvalid(ptr) || doMisalignSt || hasException(ptr) || (allocated(ptr) && vecMbCommit(ptr))) - dataBuffer.io.enq(i).bits.addr := Mux(doMisalignSt, io.maControl.control.paddr, paddrModule.io.rdata(i)) - dataBuffer.io.enq(i).bits.vaddr := Mux(doMisalignSt, io.maControl.control.vaddr, vaddrModule.io.rdata(i)) - dataBuffer.io.enq(i).bits.data := Mux(doMisalignSt, io.maControl.control.wdata, dataModule.io.rdata(i).data) - dataBuffer.io.enq(i).bits.mask := Mux(doMisalignSt, io.maControl.control.wmask, dataModule.io.rdata(i).mask) - dataBuffer.io.enq(i).bits.wline := Mux(doMisalignSt, false.B, paddrModule.io.rlineflag(i)) - dataBuffer.io.enq(i).bits.sqPtr := rdataPtrExt(i) - dataBuffer.io.enq(i).bits.prefetch := Mux(doMisalignSt, false.B, prefetch(ptr)) - // when scalar has exception, will also not write into sbuffer - dataBuffer.io.enq(i).bits.vecValid := Mux(doMisalignSt, true.B, (!isVec(ptr) || (vecDataValid(ptr) && vecNotAllMask)) && !exceptionValid && !vecHasExceptionFlagValid) + assert(!dataBuffer.io.enq(i).valid || allvalid(ptr) || hasException(ptr) || (allocated(ptr) && vecMbCommit(ptr)) || assert_flag) // dataBuffer.io.enq(i).bits.vecValid := (!isVec(ptr) || vecDataValid(ptr)) && !hasException(ptr) } @@ -1104,12 +1222,13 @@ class StoreQueue(implicit p: Parameters) extends XSModule io.sbuffer(i).bits.wline := dataBuffer.io.deq(i).bits.wline && dataBuffer.io.deq(i).bits.vecValid io.sbuffer(i).bits.prefetch := dataBuffer.io.deq(i).bits.prefetch io.sbuffer(i).bits.vecValid := dataBuffer.io.deq(i).bits.vecValid + io.sbuffer(i).bits.sqNeedDeq := dataBuffer.io.deq(i).bits.sqNeedDeq // io.sbuffer(i).fire is RegNexted, as sbuffer data write takes 2 cycles. // Before data write finish, sbuffer is unable to provide store to load // forward data. As an workaround, deqPtrExt and allocated flag update // is delayed so that load can get the right data from store queue. 
val ptr = dataBuffer.io.deq(i).bits.sqPtr.value - when (RegNext(io.sbuffer(i).fire && !doMisalignSt)) { + when (RegNext(io.sbuffer(i).fire && io.sbuffer(i).bits.sqNeedDeq)) { allocated(RegEnable(ptr, io.sbuffer(i).fire)) := false.B XSDebug("sbuffer "+i+" fire: ptr %d\n", ptr) } @@ -1236,6 +1355,12 @@ class StoreQueue(implicit p: Parameters) extends XSModule } } + // For vector, when there is a store across pages with the same uop in storeMisalignBuffer, storequeue needs to mark this item as committed. + // TODO FIXME Can vecMbCommit be removed? + when(io.maControl.toStoreQueue.withSameUop && allvalid(rdataPtrExt(0).value)) { + vecMbCommit(rdataPtrExt(0).value) := true.B + } + // misprediction recovery / exception redirect // invalidate sq term using robIdx for (i <- 0 until StoreQueueSize) { @@ -1274,23 +1399,6 @@ class StoreQueue(implicit p: Parameters) extends XSModule } assert(!(lastCycleRedirect && enqNumber =/= 0.U)) - exceptionBuffer.io.flushFrmMaBuf := finishMisalignSt - // special case (store miss align) in updating ptr - when (doMisalignSt) { - when (!finishMisalignSt) { - // dont move deqPtr and rdataPtr until all split store has been written to sb - deqPtrExtNext := deqPtrExt - rdataPtrExtNext := rdataPtrExt - } .otherwise { - // remove this unaligned store from sq - allocated(deqPtr) := false.B - committed(deqPtr) := true.B - cmtPtrExt := cmtPtrExt.map(_ + 1.U) - deqPtrExtNext := deqPtrExt.map(_ + 1.U) - rdataPtrExtNext := rdataPtrExt.map(_ + 1.U) - } - } - deqPtrExt := deqPtrExtNext rdataPtrExt := rdataPtrExtNext diff --git a/src/main/scala/xiangshan/mem/lsqueue/VirtualLoadQueue.scala b/src/main/scala/xiangshan/mem/lsqueue/VirtualLoadQueue.scala index d574adee0c..13e4451287 100644 --- a/src/main/scala/xiangshan/mem/lsqueue/VirtualLoadQueue.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/VirtualLoadQueue.scala @@ -245,8 +245,9 @@ class VirtualLoadQueue(implicit p: Parameters) extends XSModule when (io.ldin(i).valid) { val hasExceptions = ExceptionNO.selectByFu(io.ldin(i).bits.uop.exceptionVec, LduCfg).asUInt.orR val need_rep = io.ldin(i).bits.rep_info.need_rep + val need_valid = io.ldin(i).bits.updateAddrValid - when (!need_rep) { + when (!need_rep && need_valid) { // update control flag addrvalid(loadWbIndex) := hasExceptions || !io.ldin(i).bits.tlbMiss || io.ldin(i).bits.isSWPrefetch datavalid(loadWbIndex) := diff --git a/src/main/scala/xiangshan/mem/pipeline/LoadUnit.scala b/src/main/scala/xiangshan/mem/pipeline/LoadUnit.scala index 130e688adf..c0eb7c4854 100644 --- a/src/main/scala/xiangshan/mem/pipeline/LoadUnit.scala +++ b/src/main/scala/xiangshan/mem/pipeline/LoadUnit.scala @@ -71,6 +71,7 @@ class LoadToLsqReplayIO(implicit p: Parameters) extends XSBundle def bank_conflict = cause(LoadReplayCauses.C_BC) def rar_nack = cause(LoadReplayCauses.C_RAR) def raw_nack = cause(LoadReplayCauses.C_RAW) + def misalign_nack = cause(LoadReplayCauses.C_MF) def nuke = cause(LoadReplayCauses.C_NK) def need_rep = cause.asUInt.orR } @@ -195,7 +196,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule val fast_rep_out = Decoupled(new LqWriteBundle) // to misalign buffer - val misalign_buf = Valid(new LqWriteBundle) + val misalign_buf = Decoupled(new LqWriteBundle) // Load RAR rollback val rollback = Valid(new Redirect) @@ -216,6 +217,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule val s0_valid = Wire(Bool()) val s0_mmio_select = Wire(Bool()) val s0_nc_select = Wire(Bool()) + val s0_misalign_select= Wire(Bool()) val s0_kill = Wire(Bool()) val s0_can_go = s1_ready val 
s0_fire = s0_valid && s0_can_go @@ -228,6 +230,8 @@ class LoadUnit(implicit p: Parameters) extends XSModule val s0_tlb_vaddr = Wire(UInt(VAddrBits.W)) val s0_tlb_fullva = Wire(UInt(XLEN.W)) val s0_dcache_vaddr = Wire(UInt(VAddrBits.W)) + val s0_is128bit = Wire(Bool()) + val s0_misalign_wakeup_fire = s0_misalign_select && s0_can_go && io.misalign_ldin.bits.misalignNeedWakeUp // flow source bundle class FlowSource extends Bundle { @@ -318,12 +322,6 @@ class LoadUnit(implicit p: Parameters) extends XSModule val s0_src_select_vec = WireInit(VecInit((0 until SRC_NUM).map{i => s0_src_valid_vec(i) && s0_src_ready_vec(i)})) val s0_hw_prf_select = s0_src_select_vec(high_pf_idx) || s0_src_select_vec(low_pf_idx) - if (backendParams.debugEn){ - dontTouch(s0_src_valid_vec) - dontTouch(s0_src_ready_vec) - dontTouch(s0_src_select_vec) - } - val s0_tlb_no_query = s0_hw_prf_select || s0_sel_src.prf_i || s0_src_select_vec(fast_rep_idx) || s0_src_select_vec(mmio_idx) || s0_src_select_vec(nc_idx) @@ -344,6 +342,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule //judgment: is NC with data or not. //If true, it's from `io.lsq.nc_ldin` or `io.fast_rep_in` val s0_nc_with_data = s0_sel_src.isnc && !s0_kill + s0_misalign_select := s0_src_select_vec(mab_idx) && !s0_kill // if is hardware prefetch or fast replay, don't send valid to tlb s0_tlb_valid := ( @@ -404,7 +403,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule io.dcache.req.bits.id := DontCare // TODO: update cache meta io.dcache.req.bits.lqIdx := s0_sel_src.uop.lqIdx io.dcache.pf_source := Mux(s0_hw_prf_select, io.prefetch_req.bits.pf_source.value, L1_HW_PREFETCH_NULL) - io.dcache.is128Req := s0_sel_src.is128bit + io.dcache.is128Req := s0_is128bit // load flow priority mux def fromNullSource(): FlowSource = { @@ -430,7 +429,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule out.prf_rd := false.B out.prf_wr := false.B out.sched_idx := src.schedIndex - out.isvec := false.B + out.isvec := src.isvec out.is128bit := src.is128bit out.vecActive := true.B out @@ -686,6 +685,37 @@ class LoadUnit(implicit p: Parameters) extends XSModule int_vec_vaddr ) ) + s0_dcache_vaddr := Mux( + s0_src_select_vec(fast_rep_idx), io.fast_rep_in.bits.vaddr, + Mux(s0_hw_prf_select, io.prefetch_req.bits.getVaddr(), + Mux(s0_src_select_vec(nc_idx), io.lsq.nc_ldin.bits.vaddr, // not for dcache access, but for address alignment check + s0_tlb_vaddr)) + ) + + val s0_alignType = Mux(s0_sel_src.isvec, s0_sel_src.alignedType(1,0), s0_sel_src.uop.fuOpType(1, 0)) + + val s0_addr_aligned = LookupTree(s0_alignType, List( + "b00".U -> true.B, //b + "b01".U -> (s0_dcache_vaddr(0) === 0.U), //h + "b10".U -> (s0_dcache_vaddr(1, 0) === 0.U), //w + "b11".U -> (s0_dcache_vaddr(2, 0) === 0.U) //d + )) + // address align check + XSError(s0_sel_src.isvec && s0_dcache_vaddr(3, 0) =/= 0.U && s0_sel_src.alignedType(2), "unit-stride 128 bit element is not aligned!") + + val s0_check_vaddr_low = s0_dcache_vaddr(4, 0) + val s0_check_vaddr_Up_low = LookupTree(s0_alignType, List( + "b00".U -> 0.U, + "b01".U -> 1.U, + "b10".U -> 3.U, + "b11".U -> 7.U + )) +& s0_check_vaddr_low + //TODO vec? 
+ val s0_rs_cross16Bytes = s0_check_vaddr_Up_low(4) =/= s0_check_vaddr_low(4) + val s0_misalignWith16Byte = !s0_rs_cross16Bytes && !s0_addr_aligned && !s0_hw_prf_select + val s0_misalignNeedWakeUp = s0_sel_src.frm_mabuf && io.misalign_ldin.bits.misalignNeedWakeUp + val s0_finalSplit = s0_sel_src.frm_mabuf && io.misalign_ldin.bits.isFinalSplit + s0_is128bit := s0_sel_src.is128bit || s0_misalignWith16Byte // only first issue of int / vec load intructions need to check full vaddr s0_tlb_fullva := Mux(s0_src_valid_vec(mab_idx), @@ -700,12 +730,6 @@ class LoadUnit(implicit p: Parameters) extends XSModule ) ) - s0_dcache_vaddr := - Mux(s0_src_select_vec(fast_rep_idx), io.fast_rep_in.bits.vaddr, - Mux(s0_hw_prf_select, io.prefetch_req.bits.getVaddr(), - Mux(s0_src_select_vec(nc_idx), io.lsq.nc_ldin.bits.vaddr, // not for dcache access, but for address alignment check - s0_tlb_vaddr))) - s0_tlb_hlv := Mux( s0_src_valid_vec(mab_idx), LSUOpType.isHlv(io.misalign_ldin.bits.uop.fuOpType), @@ -733,15 +757,6 @@ class LoadUnit(implicit p: Parameters) extends XSModule ) ) - // address align check - val s0_addr_aligned = LookupTree(Mux(s0_sel_src.isvec, s0_sel_src.alignedType(1,0), s0_sel_src.uop.fuOpType(1, 0)), List( - "b00".U -> true.B, //b - "b01".U -> (s0_dcache_vaddr(0) === 0.U), //h - "b10".U -> (s0_dcache_vaddr(1, 0) === 0.U), //w - "b11".U -> (s0_dcache_vaddr(2, 0) === 0.U) //d - )) - XSError(s0_sel_src.isvec && s0_dcache_vaddr(3, 0) =/= 0.U && s0_sel_src.alignedType(2), "unit-stride 128 bit element is not aligned!") - // accept load flow if dcache ready (tlb is always ready) // TODO: prefetch need writeback to loadQueueFlag s0_out := DontCare @@ -758,10 +773,10 @@ class LoadUnit(implicit p: Parameters) extends XSModule s0_out.isFastPath := s0_sel_src.l2l_fwd s0_out.mshrid := s0_sel_src.mshrid s0_out.isvec := s0_sel_src.isvec - s0_out.is128bit := s0_sel_src.is128bit + s0_out.is128bit := s0_is128bit s0_out.isFrmMisAlignBuf := s0_sel_src.frm_mabuf s0_out.uop_unit_stride_fof := s0_sel_src.uop_unit_stride_fof - s0_out.paddr := + s0_out.paddr := Mux(s0_src_valid_vec(nc_idx), io.lsq.nc_ldin.bits.paddr, Mux(s0_src_valid_vec(fast_rep_idx), io.fast_rep_in.bits.paddr, Mux(s0_src_select_vec(int_iss_idx) && s0_sel_src.prf_i, 0.U, @@ -781,7 +796,9 @@ class LoadUnit(implicit p: Parameters) extends XSModule s0_out.mbIndex := s0_sel_src.mbIndex s0_out.vecBaseVaddr := s0_sel_src.vecBaseVaddr // s0_out.flowPtr := s0_sel_src.flowPtr - s0_out.uop.exceptionVec(loadAddrMisaligned) := (!s0_addr_aligned || s0_sel_src.uop.exceptionVec(loadAddrMisaligned)) && s0_sel_src.vecActive + s0_out.uop.exceptionVec(loadAddrMisaligned) := (!s0_addr_aligned || s0_sel_src.uop.exceptionVec(loadAddrMisaligned)) && s0_sel_src.vecActive && !s0_misalignWith16Byte + // TODO ??? 
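The detection added above separates two kinds of misaligned loads: those whose last byte stays inside the same 16-byte beat (kept in the pipeline as a 128-bit access, flagged misalignWith16Byte) and those that cross the beat (handled via the misalign buffer). A runnable plain-Scala restatement of the bit-4 comparison, with a few example addresses; a sketch only, not the RTL:

object Cross16ByteCheckSketch {
  // sizeBytes in {1,2,4,8}; returns (aligned, crosses16B, misalignWith16Byte)
  def classify(vaddr: Long, sizeBytes: Int): (Boolean, Boolean, Boolean) = {
    val aligned = (vaddr & (sizeBytes - 1)) == 0L
    val low     = vaddr & 0x1f              // s0_check_vaddr_low
    val upLow   = low + (sizeBytes - 1)     // s0_check_vaddr_Up_low (+& keeps the carry)
    val crosses = ((upLow >> 4) & 1) != ((low >> 4) & 1)
    (aligned, crosses, !crosses && !aligned)
  }

  def main(args: Array[String]): Unit = {
    assert(classify(0x100cL, 8) == (false, true,  false)) // 8B at ...c spans two beats
    assert(classify(0x1006L, 4) == (false, false, true))  // 4B at ...6 stays in one beat
    assert(classify(0x1008L, 8) == (true,  false, false)) // naturally aligned
  }
}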
+ s0_out.isMisalign := (!s0_addr_aligned || s0_sel_src.uop.exceptionVec(loadAddrMisaligned)) && s0_sel_src.vecActive s0_out.forward_tlDchannel := s0_src_select_vec(super_rep_idx) when(io.tlb.req.valid && s0_sel_src.isFirstIssue) { s0_out.uop.debugInfo.tlbFirstReqTime := GTimer() @@ -792,6 +809,9 @@ class LoadUnit(implicit p: Parameters) extends XSModule //for Svpbmt Nc s0_out.nc := s0_sel_src.isnc s0_out.data := s0_sel_src.data + s0_out.misalignWith16Byte := s0_misalignWith16Byte + s0_out.misalignNeedWakeUp := s0_misalignNeedWakeUp + s0_out.isFinalSplit := s0_finalSplit // load fast replay io.fast_rep_in.ready := (s0_can_go && io.dcache.req.ready && s0_src_ready_vec(fast_rep_idx)) @@ -823,6 +843,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule // load wakeup // TODO: vector load wakeup? frm_mabuf wakeup? val s0_wakeup_selector = Seq( + s0_misalign_wakeup_fire, s0_src_valid_vec(super_rep_idx), s0_src_valid_vec(fast_rep_idx), s0_mmio_fire, @@ -831,6 +852,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule s0_src_valid_vec(int_iss_idx) ) val s0_wakeup_format = Seq( + io.misalign_ldin.bits.uop, io.replay.bits.uop, io.fast_rep_in.bits.uop, io.lsq.uncache.bits.uop, @@ -840,12 +862,12 @@ class LoadUnit(implicit p: Parameters) extends XSModule ) val s0_wakeup_uop = ParallelPriorityMux(s0_wakeup_selector, s0_wakeup_format) io.wakeup.valid := s0_fire && !s0_sel_src.isvec && !s0_sel_src.frm_mabuf && ( - s0_src_valid_vec(super_rep_idx) || - s0_src_valid_vec(fast_rep_idx) || - s0_src_valid_vec(lsq_rep_idx) || - (s0_src_valid_vec(int_iss_idx) && !s0_sel_src.prf && + s0_src_valid_vec(super_rep_idx) || + s0_src_valid_vec(fast_rep_idx) || + s0_src_valid_vec(lsq_rep_idx) || + (s0_src_valid_vec(int_iss_idx) && !s0_sel_src.prf && !s0_src_valid_vec(vec_iss_idx) && !s0_src_valid_vec(high_pf_idx)) - ) || s0_mmio_fire || s0_nc_fire + ) || s0_mmio_fire || s0_nc_fire || s0_misalign_wakeup_fire io.wakeup.bits := s0_wakeup_uop // prefetch.i(Zicbop) @@ -948,7 +970,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule // st-ld violation query // if store unit is 128-bits memory access, need match 128-bit - private val s1_isMatch128 = io.stld_nuke_query.map(x => (x.bits.matchLine || (s1_in.isvec && s1_in.is128bit))) + private val s1_isMatch128 = io.stld_nuke_query.map(x => (x.bits.matchLine || ((s1_in.isvec || s1_in.misalignWith16Byte) && s1_in.is128bit))) val s1_nuke_paddr_match = VecInit((0 until StorePipelineWidth).zip(s1_isMatch128).map{case (w, s) => {Mux(s, s1_paddr_dup_lsu(PAddrBits-1, 4) === io.stld_nuke_query(w).bits.paddr(PAddrBits-1, 4), s1_paddr_dup_lsu(PAddrBits-1, 3) === io.stld_nuke_query(w).bits.paddr(PAddrBits-1, 3))}}) @@ -982,7 +1004,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule s1_out.uop.exceptionVec(loadPageFault) := io.tlb.resp.bits.excp(0).pf.ld && s1_vecActive && !s1_tlb_miss && !s1_in.tlbNoQuery s1_out.uop.exceptionVec(loadGuestPageFault) := io.tlb.resp.bits.excp(0).gpf.ld && !s1_tlb_miss && !s1_in.tlbNoQuery s1_out.uop.exceptionVec(loadAccessFault) := io.tlb.resp.bits.excp(0).af.ld && s1_vecActive && !s1_tlb_miss && !s1_in.tlbNoQuery - when (!s1_out.isvec && RegNext(io.tlb.req.bits.checkfullva) && + when (RegNext(io.tlb.req.bits.checkfullva) && (s1_out.uop.exceptionVec(loadPageFault) || s1_out.uop.exceptionVec(loadGuestPageFault) || s1_out.uop.exceptionVec(loadAccessFault))) { @@ -1014,9 +1036,9 @@ class LoadUnit(implicit p: Parameters) extends XSModule s1_cancel_ptr_chasing || s1_in.uop.robIdx.needFlush(io.redirect) || 
(s1_in.uop.robIdx.needFlush(s1_redirect_reg) && !GatedValidRegNext(s0_try_ptr_chasing)) || - RegEnable(s0_kill, false.B, io.ldin.valid || - io.vecldin.valid || io.replay.valid || - io.l2l_fwd_in.valid || io.fast_rep_in.valid || + RegEnable(s0_kill, false.B, io.ldin.valid || + io.vecldin.valid || io.replay.valid || + io.l2l_fwd_in.valid || io.fast_rep_in.valid || io.misalign_ldin.valid || io.lsq.nc_ldin.valid ) @@ -1052,9 +1074,9 @@ class LoadUnit(implicit p: Parameters) extends XSModule s1_in.uop.debugInfo.tlbRespTime := GTimer() } when (!s1_cancel_ptr_chasing) { - s0_ptr_chasing_canceled := s1_try_ptr_chasing && - !io.replay.fire && !io.fast_rep_in.fire && - !(s0_src_valid_vec(high_pf_idx) && io.canAcceptHighConfPrefetch) && + s0_ptr_chasing_canceled := s1_try_ptr_chasing && + !io.replay.fire && !io.fast_rep_in.fire && + !(s0_src_valid_vec(high_pf_idx) && io.canAcceptHighConfPrefetch) && !io.misalign_ldin.fire && !io.lsq.nc_ldin.valid when (s1_try_ptr_chasing) { @@ -1145,7 +1167,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule s2_exception_vec(loadAccessFault) := s2_vecActive && ( s2_in.uop.exceptionVec(loadAccessFault) || s2_pmp.ld || - s2_isvec && s2_actually_uncache && !s2_prf && !s2_in.tlbMiss || + (s2_isvec || s2_frm_mabuf) && s2_actually_uncache && !s2_prf && !s2_in.tlbMiss || io.dcache.resp.bits.tag_error && GatedValidRegNext(io.csrCtrl.cache_error_enable) ) } @@ -1159,8 +1181,10 @@ class LoadUnit(implicit p: Parameters) extends XSModule } val s2_exception = s2_vecActive && (s2_trigger_debug_mode || ExceptionNO.selectByFu(s2_exception_vec, LduCfg).asUInt.orR) - val s2_mis_align = s2_valid && GatedValidRegNext(io.csrCtrl.hd_misalign_ld_enable) && !s2_in.isvec && - s2_exception_vec(loadAddrMisaligned) && !s2_exception_vec(breakPoint) && !s2_trigger_debug_mode + val s2_mis_align = s2_valid && GatedValidRegNext(io.csrCtrl.hd_misalign_ld_enable) && + s2_in.isMisalign && !s2_in.misalignWith16Byte && !s2_exception_vec(breakPoint) && !s2_trigger_debug_mode + val s2_only_misalign_exception = !ExceptionNO.selectByFuAndUnSelect(s2_exception_vec, LduCfg, Seq(loadAddrMisaligned)).asUInt.orR && !s2_trigger_debug_mode && + s2_vecActive && s2_exception_vec(loadAddrMisaligned) val (s2_fwd_frm_d_chan, s2_fwd_data_frm_d_chan) = io.tl_d_channel.forward(s1_valid && s1_out.forward_tlDchannel, s1_out.mshrid, s1_out.paddr) val (s2_fwd_data_valid, s2_fwd_frm_mshr, s2_fwd_data_frm_mshr) = io.forward_mshr.forward() val s2_fwd_frm_d_chan_or_mshr = s2_fwd_data_valid && (s2_fwd_frm_d_chan || s2_fwd_frm_mshr) @@ -1169,7 +1193,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule // * ecc data error is slow to generate, so we will not use it until load stage 3 // * in load stage 3, an extra signal io.load_error will be used to // * if pbmt =/= 0, mmio is up to pbmt; otherwise, it's up to pmp - val s2_mmio = !s2_prf && + val s2_mmio = !s2_prf && !s2_exception && !s2_in.tlbMiss && Mux(Pbmt.isUncache(s2_pbmt), s2_in.mmio, s2_pmp.mmio) val s2_uncache = !s2_prf && !s2_exception && !s2_in.tlbMiss && s2_actually_uncache @@ -1207,7 +1231,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule // 2. Load instruction is younger than requestors(store instructions). // 3. Physical address match. // 4. Data contains. 
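With misalignWith16Byte loads now issued as 128-bit accesses, the matchLine change above (and its s2 counterpart just below) widens the address comparison of the store-load nuke check to 16-byte granularity for them, as was already done for 128-bit vector accesses; an 8-byte-granularity compare could miss a store that only touches the upper half of the beat. A compact sketch of the two comparison widths (condition 3 of the list above only; the mask check is not modelled):

object NukeMatchSketch {
  // match128: compare at 16-byte granularity (drop paddr[3:0]),
  // otherwise at 8-byte granularity (drop paddr[2:0]).
  def paddrMatch(loadPaddr: Long, storePaddr: Long, match128: Boolean): Boolean =
    if (match128) (loadPaddr >> 4) == (storePaddr >> 4)
    else          (loadPaddr >> 3) == (storePaddr >> 3)

  def main(args: Array[String]): Unit = {
    val load  = 0x80000006L  // misaligned load treated as a 128-bit beat at 0x80000000
    val store = 0x80000008L  // store hitting the upper half of that beat
    assert(!paddrMatch(load, store, match128 = false)) // 8-byte compare would miss it
    assert( paddrMatch(load, store, match128 = true))  // 16-byte compare catches it
  }
}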
- private val s2_isMatch128 = io.stld_nuke_query.map(x => (x.bits.matchLine || (s2_in.isvec && s2_in.is128bit))) + private val s2_isMatch128 = io.stld_nuke_query.map(x => (x.bits.matchLine || ((s2_in.isvec || s2_in.misalignWith16Byte) && s2_in.is128bit))) val s2_nuke_paddr_match = VecInit((0 until StorePipelineWidth).zip(s2_isMatch128).map{case (w, s) => {Mux(s, s2_in.paddr(PAddrBits-1, 4) === io.stld_nuke_query(w).bits.paddr(PAddrBits-1, 4), s2_in.paddr(PAddrBits-1, 3) === io.stld_nuke_query(w).bits.paddr(PAddrBits-1, 3))}}) @@ -1245,7 +1269,8 @@ class LoadUnit(implicit p: Parameters) extends XSModule !s2_tlb_miss && !s2_fwd_fail && (s2_dcache_fast_rep || s2_nuke_fast_rep) && - s2_troublem + s2_troublem && + !s2_in.misalignNeedWakeUp // need allocate new entry val s2_can_query = !s2_mem_amb && @@ -1258,8 +1283,8 @@ class LoadUnit(implicit p: Parameters) extends XSModule val s2_fwd_vp_match_invalid = io.lsq.forward.matchInvalid || io.sbuffer.matchInvalid || io.ubuffer.matchInvalid val s2_vp_match_fail = s2_fwd_vp_match_invalid && s2_troublem - val s2_safe_wakeup = !s2_out.rep_info.need_rep && !s2_mmio && (!s2_in.nc || s2_nc_with_data) && !s2_mis_align && !s2_exception // don't need to replay and is not a mmio\misalign no data - val s2_safe_writeback = s2_exception || s2_safe_wakeup || s2_vp_match_fail + val s2_safe_wakeup = !s2_out.rep_info.need_rep && !s2_mmio && (!s2_in.nc || s2_nc_with_data) && !s2_mis_align && !s2_exception || s2_in.misalignNeedWakeUp // don't need to replay and is not a mmio\misalign no data + val s2_safe_writeback = s2_exception || s2_safe_wakeup || s2_vp_match_fail || s2_in.misalignNeedWakeUp // ld-ld violation require io.lsq.ldld_nuke_query.req.valid := s2_valid && s2_can_query @@ -1285,8 +1310,8 @@ class LoadUnit(implicit p: Parameters) extends XSModule // generate XLEN/8 Muxs for (i <- 0 until VLEN / 8) { s2_fwd_mask(i) := io.lsq.forward.forwardMask(i) || io.sbuffer.forwardMask(i) || io.ubuffer.forwardMask(i) - s2_fwd_data(i) := - Mux(io.lsq.forward.forwardMask(i), io.lsq.forward.forwardData(i), + s2_fwd_data(i) := + Mux(io.lsq.forward.forwardMask(i), io.lsq.forward.forwardData(i), Mux(s2_nc_with_data, io.ubuffer.forwardData(i), io.sbuffer.forwardData(i))) } @@ -1377,6 +1402,11 @@ class LoadUnit(implicit p: Parameters) extends XSModule io.prefetch_train.bits.miss := RegEnable(io.dcache.resp.bits.miss, s2_prefetch_train_valid) // TODO: use trace with bank conflict? 
io.prefetch_train.bits.meta_prefetch := RegEnable(io.dcache.resp.bits.meta_prefetch, s2_prefetch_train_valid) io.prefetch_train.bits.meta_access := RegEnable(io.dcache.resp.bits.meta_access, s2_prefetch_train_valid) + io.prefetch_train.bits.isFinalSplit := false.B + io.prefetch_train.bits.misalignWith16Byte := false.B + io.prefetch_train.bits.misalignNeedWakeUp := false.B + io.prefetch_train.bits.updateAddrValid := false.B + io.prefetch_train.bits.isMisalign := false.B io.s1_prefetch_spec := s1_fire io.s2_prefetch_spec := s2_prefetch_train_valid @@ -1387,6 +1417,11 @@ class LoadUnit(implicit p: Parameters) extends XSModule io.prefetch_train_l1.bits.miss := RegEnable(io.dcache.resp.bits.miss, s2_prefetch_train_l1_valid) io.prefetch_train_l1.bits.meta_prefetch := RegEnable(io.dcache.resp.bits.meta_prefetch, s2_prefetch_train_l1_valid) io.prefetch_train_l1.bits.meta_access := RegEnable(io.dcache.resp.bits.meta_access, s2_prefetch_train_l1_valid) + io.prefetch_train_l1.bits.isFinalSplit := false.B + io.prefetch_train_l1.bits.misalignWith16Byte := false.B + io.prefetch_train_l1.bits.misalignNeedWakeUp := false.B + io.prefetch_train_l1.bits.updateAddrValid := false.B + io.prefetch_train_l1.bits.isMisalign := false.B if (env.FPGAPlatform){ io.dcache.s0_pc := DontCare io.dcache.s1_pc := DontCare @@ -1439,6 +1474,8 @@ class LoadUnit(implicit p: Parameters) extends XSModule val s3_exception = RegEnable(s2_exception, s2_fire) val s3_mis_align = RegEnable(s2_mis_align, s2_fire) val s3_trigger_debug_mode = RegEnable(s2_trigger_debug_mode, false.B, s2_fire) + val s3_onlyMisalignException = RegEnable(s2_only_misalign_exception, false.B, s2_fire) + // TODO: Fix vector load merge buffer nack val s3_vec_mb_nack = Wire(Bool()) s3_vec_mb_nack := false.B @@ -1455,24 +1492,25 @@ class LoadUnit(implicit p: Parameters) extends XSModule io.fast_rep_out.valid := s3_valid && s3_fast_rep && !s3_in.uop.robIdx.needFlush(io.redirect) io.fast_rep_out.bits := s3_in - io.lsq.ldin.valid := s3_valid && (!s3_fast_rep || s3_fast_rep_canceled) && !s3_in.feedbacked && !s3_frm_mabuf && !s3_nc_with_data + io.lsq.ldin.valid := s3_valid && (!s3_fast_rep || s3_fast_rep_canceled) && !s3_in.feedbacked && !s3_nc_with_data && !s3_in.misalignNeedWakeUp // TODO: check this --by hx // io.lsq.ldin.valid := s3_valid && (!s3_fast_rep || !io.fast_rep_out.ready) && !s3_in.feedbacked && !s3_in.lateKill io.lsq.ldin.bits := s3_in io.lsq.ldin.bits.miss := s3_in.miss // connect to misalignBuffer - io.misalign_buf.valid := io.lsq.ldin.valid && GatedValidRegNext(io.csrCtrl.hd_misalign_ld_enable) && !io.lsq.ldin.bits.isvec + val toMisalignBufferValid = io.lsq.ldin.valid && s3_mis_align && !s3_frm_mabuf + io.misalign_buf.valid := toMisalignBufferValid io.misalign_buf.bits := s3_in /* <------- DANGEROUS: Don't change sequence here ! 
-------> */ io.lsq.ldin.bits.data_wen_dup := s3_ld_valid_dup.asBools io.lsq.ldin.bits.replacementUpdated := io.dcache.resp.bits.replacementUpdated io.lsq.ldin.bits.missDbUpdated := GatedValidRegNext(s2_fire && s2_in.hasROBEntry && !s2_in.tlbMiss && !s2_in.missDbUpdated) + io.lsq.ldin.bits.updateAddrValid := (!s3_in.isMisalign || s3_in.misalignWith16Byte) && (!s3_frm_mabuf || s3_in.isFinalSplit) || (s3_exception && !s3_onlyMisalignException) io.s3_dly_ld_err := false.B // s3_dly_ld_err && s3_valid io.lsq.ldin.bits.dcacheRequireReplay := s3_dcache_rep - io.fast_rep_out.bits.delayedLoadError := s3_dly_ld_err val s3_vp_match_fail = GatedValidRegNext(s2_fwd_vp_match_invalid) && s3_troublem val s3_rep_frm_fetch = s3_vp_match_fail @@ -1482,20 +1520,38 @@ class LoadUnit(implicit p: Parameters) extends XSModule GatedValidRegNext(io.csrCtrl.ldld_vio_check_enable) val s3_flushPipe = s3_ldld_rep_inst - val s3_rep_info = WireInit(s3_in.rep_info) - val s3_sel_rep_cause = PriorityEncoderOH(s3_rep_info.cause.asUInt) + val s3_lrq_rep_info = WireInit(s3_in.rep_info) + s3_lrq_rep_info.misalign_nack := toMisalignBufferValid && !io.misalign_buf.ready + val s3_lrq_sel_rep_cause = PriorityEncoderOH(s3_lrq_rep_info.cause.asUInt) + val s3_replayqueue_rep_cause = WireInit(0.U.asTypeOf(s3_in.rep_info.cause)) + - when (s3_exception || s3_dly_ld_err || s3_rep_frm_fetch) { - io.lsq.ldin.bits.rep_info.cause := 0.U.asTypeOf(s3_rep_info.cause.cloneType) + val s3_mab_rep_info = WireInit(s3_in.rep_info) + val s3_mab_sel_rep_cause = PriorityEncoderOH(s3_mab_rep_info.cause.asUInt) + val s3_misalign_rep_cause = WireInit(0.U.asTypeOf(s3_in.rep_info.cause)) + + s3_misalign_rep_cause := Mux( + s3_in.misalignNeedWakeUp, + 0.U.asTypeOf(s3_mab_rep_info.cause.cloneType), + VecInit(s3_mab_sel_rep_cause.asBools) + ) + + when (s3_exception || s3_dly_ld_err || s3_rep_frm_fetch || s3_frm_mabuf) { + s3_replayqueue_rep_cause := 0.U.asTypeOf(s3_lrq_rep_info.cause.cloneType) + s3_replayqueue_rep_cause(LoadReplayCauses.C_MF) := s3_onlyMisalignException && !s3_frm_mabuf && s3_lrq_rep_info.misalign_nack } .otherwise { - io.lsq.ldin.bits.rep_info.cause := VecInit(s3_sel_rep_cause.asBools) + s3_replayqueue_rep_cause := VecInit(s3_lrq_sel_rep_cause.asBools) + } + io.lsq.ldin.bits.rep_info.cause := s3_replayqueue_rep_cause + // Int load, if hit, will be writebacked at s3 s3_out.valid := s3_valid && s3_safe_writeback s3_out.bits.uop := s3_in.uop s3_out.bits.uop.fpWen := s3_in.uop.fpWen s3_out.bits.uop.exceptionVec(loadAccessFault) := (s3_dly_ld_err || s3_in.uop.exceptionVec(loadAccessFault)) && s3_vecActive + s3_out.bits.uop.exceptionVec(loadAddrMisaligned) := s3_in.mmio && s3_in.isMisalign s3_out.bits.uop.flushPipe := false.B s3_out.bits.uop.replayInst := false.B s3_out.bits.data := s3_in.data @@ -1527,18 +1583,22 @@ class LoadUnit(implicit p: Parameters) extends XSModule s3_vecout.vecTriggerMask := s3_in.vecTriggerMask val s3_usSecondInv = s3_in.usSecondInv - io.rollback.valid := s3_valid && (s3_rep_frm_fetch || s3_flushPipe) && !s3_exception + val s3_frm_mis_flush = s3_frm_mabuf && + (io.misalign_ldout.bits.rep_info.fwd_fail || io.misalign_ldout.bits.rep_info.mem_amb || io.misalign_ldout.bits.rep_info.nuke) + + io.rollback.valid := s3_valid && (s3_rep_frm_fetch || s3_flushPipe || s3_frm_mis_flush) && !s3_exception io.rollback.bits := DontCare io.rollback.bits.isRVC := s3_out.bits.uop.preDecodeInfo.isRVC io.rollback.bits.robIdx := s3_out.bits.uop.robIdx io.rollback.bits.ftqIdx := s3_out.bits.uop.ftqPtr io.rollback.bits.ftqOffset := 
s3_out.bits.uop.ftqOffset - io.rollback.bits.level := Mux(s3_rep_frm_fetch, RedirectLevel.flush, RedirectLevel.flushAfter) + io.rollback.bits.level := Mux(s3_rep_frm_fetch || s3_frm_mis_flush, RedirectLevel.flush, RedirectLevel.flushAfter) io.rollback.bits.cfiUpdate.target := s3_out.bits.uop.pc io.rollback.bits.debug_runahead_checkpoint_id := s3_out.bits.uop.debugInfo.runahead_checkpoint_id /* <------- DANGEROUS: Don't change sequence here ! -------> */ io.lsq.ldin.bits.uop := s3_out.bits.uop +// io.lsq.ldin.bits.uop.exceptionVec(loadAddrMisaligned) := Mux(s3_in.onlyMisalignException, false.B, s3_in.uop.exceptionVec(loadAddrMisaligned)) val s3_revoke = s3_exception || io.lsq.ldin.bits.rep_info.need_rep io.lsq.ldld_nuke_query.revoke := s3_revoke @@ -1554,7 +1614,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule // feedback: scalar load will send feedback to RS // vector load will send signal to VL Merge Buffer, then send feedback at granularity of uops io.feedback_slow.valid := s3_valid && s3_fb_no_waiting && !s3_isvec && !s3_frm_mabuf - io.feedback_slow.bits.hit := !s3_rep_info.need_rep || io.lsq.ldin.ready + io.feedback_slow.bits.hit := !s3_lrq_rep_info.need_rep || io.lsq.ldin.ready io.feedback_slow.bits.flushState := s3_in.ptwBack io.feedback_slow.bits.robIdx := s3_in.uop.robIdx io.feedback_slow.bits.sqIdx := s3_in.uop.sqIdx @@ -1563,7 +1623,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule io.feedback_slow.bits.dataInvalidSqIdx := DontCare // TODO: vector wakeup? - io.ldCancel.ld2Cancel := s3_valid && !s3_safe_wakeup && !s3_isvec && !s3_frm_mabuf + io.ldCancel.ld2Cancel := s3_valid && !s3_safe_wakeup && !s3_isvec && (!s3_frm_mabuf || s3_in.misalignNeedWakeUp) val s3_ld_wb_meta = Mux(s3_valid, s3_out.bits, s3_mmio.bits) @@ -1583,10 +1643,10 @@ class LoadUnit(implicit p: Parameters) extends XSModule val s3_ld_data_frm_mmio = rdataHelper(s3_ld_raw_data_frm_mmio.uop, s3_picked_data_frm_mmio) /* data from pipe, which forward from respectively - * dcache hit: [D channel, mshr, sbuffer, sq] + * dcache hit: [D channel, mshr, sbuffer, sq] * nc_with_data: [sq] */ - + val s2_ld_data_frm_nc = shiftDataToHigh(s2_out.paddr, s2_out.data) val s3_ld_raw_data_frm_pipe = Wire(new LoadDataFromDcacheBundle) @@ -1651,7 +1711,13 @@ class LoadUnit(implicit p: Parameters) extends XSModule val s3_picked_data_frm_pipe = VecInit((0 until LdDataDup).map(i => { Mux1H(s3_data_select_by_offset, s3_data_frm_pipe(i)) })) - val s3_ld_data_frm_pipe = newRdataHelper(s3_data_select, s3_picked_data_frm_pipe(0)) + val s3_shift_data = Mux( + s3_in.misalignWith16Byte, + (s3_merged_data_frm_pipe >> (s3_in.vaddr(3, 0) << 3)).asUInt(63, 0), + s3_picked_data_frm_pipe(0) + ) + + val s3_ld_data_frm_pipe = newRdataHelper(s3_data_select, s3_shift_data) // FIXME: add 1 cycle delay ? 
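For a misalignWith16Byte load the pipeline returns the whole 128-bit beat, and s3_shift_data above extracts the result by shifting right by the byte offset times eight before the usual rdata selection. A quick plain-Scala check of that extraction (a sketch of the arithmetic only):

object MisalignShiftSketch {
  // beat: the 128-bit line data, offset: vaddr(3,0); returns the low 64 bits
  // after shifting, mirroring (data >> (vaddr(3,0) << 3))(63, 0).
  def extract(beat: BigInt, offset: Int): BigInt =
    (beat >> (offset * 8)) & ((BigInt(1) << 64) - 1)

  def main(args: Array[String]): Unit = {
    // Bytes 0x00..0x0f laid out little-endian in a 16-byte beat.
    val beat = (0 until 16).foldLeft(BigInt(0))((acc, i) => acc | (BigInt(i) << (8 * i)))
    // A 4-byte load at offset 6 should see bytes 6, 7, 8, 9 in its low 32 bits.
    assert((extract(beat, 6) & 0xffffffffL) == BigInt(0x09080706L))
  }
}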
// io.lsq.uncache.ready := !s3_valid @@ -1669,6 +1735,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule FuType.ldu.U ) + XSError(s3_valid && s3_in.misalignNeedWakeUp && !s3_frm_mabuf, "Only the needwakeup from the misalignbuffer may be high") // TODO: check this --hx // io.ldout.valid := s3_out.valid && !s3_out.bits.uop.robIdx.needFlush(io.redirect) && !s3_vecout.isvec || // io.lsq.uncache.valid && !io.lsq.uncache.bits.uop.robIdx.needFlush(io.redirect) && !s3_out.valid && !io.lsq.uncache.bits.isVls @@ -1680,8 +1747,9 @@ class LoadUnit(implicit p: Parameters) extends XSModule io.fast_rep_out.valid := s3_valid && s3_fast_rep io.fast_rep_out.bits := s3_in io.fast_rep_out.bits.lateKill := s3_rep_frm_fetch + io.fast_rep_out.bits.delayedLoadError := s3_dly_ld_err - val vecFeedback = s3_valid && s3_fb_no_waiting && s3_rep_info.need_rep && !io.lsq.ldin.ready && s3_isvec + val vecFeedback = s3_valid && s3_fb_no_waiting && s3_lrq_rep_info.need_rep && !io.lsq.ldin.ready && s3_isvec // vector output io.vecldout.bits.alignedType := s3_vec_alignedType @@ -1689,7 +1757,8 @@ class LoadUnit(implicit p: Parameters) extends XSModule io.vecldout.bits.vecFeedback := vecFeedback // TODO: VLSU, uncache data logic val vecdata = rdataVecHelper(s3_vec_alignedType(1,0), s3_picked_data_frm_pipe(1)) - io.vecldout.bits.vecdata.get := Mux(s3_in.is128bit, s3_merged_data_frm_pipe, vecdata) + val vecShiftData = (s3_merged_data_frm_pipe >> (s3_in.vaddr(3, 0) << 3)).asUInt(63, 0) + io.vecldout.bits.vecdata.get := Mux(s3_in.misalignWith16Byte, vecShiftData, Mux(s3_in.is128bit, s3_merged_data_frm_pipe, vecdata)) io.vecldout.bits.isvec := s3_vecout.isvec io.vecldout.bits.elemIdx := s3_vecout.elemIdx io.vecldout.bits.elemIdxInsideVd.get := s3_vecout.elemIdxInsideVd @@ -1697,7 +1766,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule io.vecldout.bits.reg_offset.get := s3_vecout.reg_offset io.vecldout.bits.usSecondInv := s3_usSecondInv io.vecldout.bits.mBIndex := s3_vec_mBIndex - io.vecldout.bits.hit := !s3_rep_info.need_rep || io.lsq.ldin.ready + io.vecldout.bits.hit := !s3_lrq_rep_info.need_rep || io.lsq.ldin.ready io.vecldout.bits.sourceType := RSFeedbackType.lrqFull io.vecldout.bits.trigger := s3_vecout.trigger io.vecldout.bits.flushState := DontCare @@ -1711,7 +1780,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule io.vecldout.bits.vecTriggerMask := s3_vecout.vecTriggerMask io.vecldout.bits.nc := DontCare - io.vecldout.valid := s3_out.valid && !s3_out.bits.uop.robIdx.needFlush(io.redirect) && s3_vecout.isvec //|| + io.vecldout.valid := s3_out.valid && !s3_out.bits.uop.robIdx.needFlush(io.redirect) && s3_vecout.isvec && !s3_mis_align && !s3_frm_mabuf //|| // TODO: check this, why !io.lsq.uncache.bits.isVls before? // Now vector instruction don't support mmio. 
// io.lsq.uncache.valid && !io.lsq.uncache.bits.uop.robIdx.needFlush(io.redirect) && !s3_out.valid && io.lsq.uncache.bits.isVls @@ -1719,11 +1788,12 @@ class LoadUnit(implicit p: Parameters) extends XSModule io.misalign_ldout.valid := s3_valid && (!s3_fast_rep || s3_fast_rep_canceled) && s3_frm_mabuf io.misalign_ldout.bits := io.lsq.ldin.bits - io.misalign_ldout.bits.data := Mux(s3_in.is128bit, s3_merged_data_frm_pipe, s3_picked_data_frm_pipe(2)) + io.misalign_ldout.bits.data := Mux(s3_in.misalignWith16Byte, s3_merged_data_frm_pipe, s3_picked_data_frm_pipe(2)) + io.misalign_ldout.bits.rep_info.cause := s3_misalign_rep_cause // fast load to load forward if (EnableLoadToLoadForward) { - io.l2l_fwd_out.valid := s3_valid && !s3_in.mmio && !s3_in.nc && !s3_rep_info.need_rep + io.l2l_fwd_out.valid := s3_valid && !s3_in.mmio && !s3_in.nc && !s3_lrq_rep_info.need_rep io.l2l_fwd_out.data := Mux(s3_in.vaddr(3), s3_merged_data_frm_pipe(127, 64), s3_merged_data_frm_pipe(63, 0)) io.l2l_fwd_out.dly_ld_err := s3_dly_ld_err || // ecc delayed error s3_ldld_rep_inst || @@ -1748,8 +1818,8 @@ class LoadUnit(implicit p: Parameters) extends XSModule io.debug_ls.s3_isReplayFast := s3_valid && s3_fast_rep && !s3_fast_rep_canceled io.debug_ls.s3_isReplayRS := RegNext(io.feedback_fast.valid && !io.feedback_fast.bits.hit) || (io.feedback_slow.valid && !io.feedback_slow.bits.hit) io.debug_ls.s3_isReplaySlow := io.lsq.ldin.valid && io.lsq.ldin.bits.rep_info.need_rep - io.debug_ls.s3_isReplay := s3_valid && s3_rep_info.need_rep // include fast+slow+rs replay - io.debug_ls.replayCause := s3_rep_info.cause + io.debug_ls.s3_isReplay := s3_valid && s3_lrq_rep_info.need_rep // include fast+slow+rs replay + io.debug_ls.replayCause := s3_lrq_rep_info.cause io.debug_ls.replayCnt := 1.U // Topdown @@ -1787,6 +1857,11 @@ class LoadUnit(implicit p: Parameters) extends XSModule XSPerfAccumulate("s0_hardware_prefetch_blocked", io.prefetch_req.valid && !s0_hw_prf_select) XSPerfAccumulate("s0_hardware_prefetch_total", io.prefetch_req.valid) + XSPerfAccumulate("s3_rollback_total", io.rollback.valid) + XSPerfAccumulate("s3_rep_frm_fetch_rollback", io.rollback.valid && s3_rep_frm_fetch) + XSPerfAccumulate("s3_flushPipe_rollback", io.rollback.valid && s3_flushPipe) + XSPerfAccumulate("s3_frm_mis_flush_rollback", io.rollback.valid && s3_frm_mis_flush) + XSPerfAccumulate("s1_in_valid", s1_valid) XSPerfAccumulate("s1_in_fire", s1_fire) XSPerfAccumulate("s1_in_fire_first_issue", s1_fire && s1_in.isFirstIssue) @@ -1850,5 +1925,17 @@ class LoadUnit(implicit p: Parameters) extends XSModule when(io.ldout.fire){ XSDebug("ldout %x\n", io.ldout.bits.uop.pc) } + + if (backendParams.debugEn){ + dontTouch(s0_src_valid_vec) + dontTouch(s0_src_ready_vec) + dontTouch(s0_src_select_vec) + dontTouch(s3_ld_data_frm_pipe) + dontTouch(s3_shift_data) + s3_data_select_by_offset.map(x=> dontTouch(x)) + s3_data_frm_pipe.map(x=> dontTouch(x)) + s3_picked_data_frm_pipe.map(x=> dontTouch(x)) + } + // end } diff --git a/src/main/scala/xiangshan/mem/pipeline/StoreUnit.scala b/src/main/scala/xiangshan/mem/pipeline/StoreUnit.scala index c4e0d23ec9..2f8a6c20a5 100644 --- a/src/main/scala/xiangshan/mem/pipeline/StoreUnit.scala +++ b/src/main/scala/xiangshan/mem/pipeline/StoreUnit.scala @@ -66,9 +66,11 @@ class StoreUnit(implicit p: Parameters) extends XSModule val vecstin = Flipped(Decoupled(new VecPipeBundle(isVStore = true))) val vec_isFirstIssue = Input(Bool()) // writeback to misalign buffer - val misalign_buf = Valid(new LsPipelineBundle) + val misalign_buf = 
Decoupled(new LsPipelineBundle) // trigger val fromCsrTrigger = Input(new CsrTriggerBundle) + + val s0_s1_valid = Output(Bool()) }) val s1_ready, s2_ready, s3_ready = WireInit(false.B) @@ -114,7 +116,7 @@ class StoreUnit(implicit p: Parameters) extends XSModule val s0_kill = s0_uop.robIdx.needFlush(io.redirect) val s0_can_go = s1_ready val s0_fire = s0_valid && !s0_kill && s0_can_go - val s0_is128bit = Mux(s0_use_flow_ma, io.misalign_stin.bits.is128bit, is128Bit(s0_vecstin.alignedType)) + val s0_is128bit = Wire(Bool()) // vector val s0_vecActive = !s0_use_flow_vec || s0_vecstin.vecActive // val s0_flowPtr = s0_vecstin.flowPtr @@ -124,10 +126,12 @@ class StoreUnit(implicit p: Parameters) extends XSModule val s0_alignedType = s0_vecstin.alignedType val s0_mBIndex = s0_vecstin.mBIndex val s0_vecBaseVaddr = s0_vecstin.basevaddr + val s0_isFinalSplit = io.misalign_stin.valid && io.misalign_stin.bits.isFinalSplit // generate addr val s0_saddr = s0_stin.src(0) + SignExt(s0_stin.uop.imm(11,0), VAddrBits) val s0_fullva = Wire(UInt(XLEN.W)) + val s0_vaddr = Mux( s0_use_flow_ma, io.misalign_stin.bits.vaddr, @@ -141,6 +145,30 @@ class StoreUnit(implicit p: Parameters) extends XSModule ) ) ) + + val s0_alignTpye = Mux(s0_use_flow_vec, s0_vecstin.alignedType(1,0), s0_uop.fuOpType(1, 0)) + // exception check + val s0_addr_aligned = LookupTree(s0_alignTpye, List( + "b00".U -> true.B, //b + "b01".U -> (s0_vaddr(0) === 0.U), //h + "b10".U -> (s0_vaddr(1,0) === 0.U), //w + "b11".U -> (s0_vaddr(2,0) === 0.U) //d + )) + // if vector store sends 128-bit requests, its address must be 128-aligned + XSError(s0_use_flow_vec && s0_vaddr(3, 0) =/= 0.U && s0_vecstin.alignedType(2), "unit stride 128 bit element is not aligned!") + + val s0_isMisalign = Mux(s0_use_non_prf_flow, (!s0_addr_aligned || s0_vecstin.uop.exceptionVec(storeAddrMisaligned) && s0_vecActive), false.B) + val s0_addr_low = s0_vaddr(4, 0) + val s0_addr_Up_low = LookupTree(s0_alignTpye, List( + "b00".U -> 0.U, + "b01".U -> 1.U, + "b10".U -> 3.U, + "b11".U -> 7.U + )) +& s0_addr_low + val s0_rs_corss16Bytes = s0_addr_Up_low(4) =/= s0_addr_low(4) + val s0_misalignWith16Byte = !s0_rs_corss16Bytes && !s0_addr_aligned && !s0_use_flow_prf + s0_is128bit := Mux(s0_use_flow_ma, io.misalign_stin.bits.is128bit, is128Bit(s0_vecstin.alignedType) || s0_misalignWith16Byte) + s0_fullva := Mux( s0_use_flow_rs, s0_stin.src(0) + SignExt(s0_stin.uop.imm(11,0), XLEN), @@ -205,7 +233,8 @@ class StoreUnit(implicit p: Parameters) extends XSModule s0_out.data := s0_stin.src(1) s0_out.uop := s0_uop s0_out.miss := false.B - s0_out.mask := s0_mask + // For unaligned, we need to generate a base-aligned mask in storeunit and then do a shift split in StoreQueue. 
+ s0_out.mask := Mux(s0_rs_corss16Bytes && !s0_addr_aligned, genBasemask(s0_saddr,s0_alignTpye(1,0)), s0_mask) s0_out.isFirstIssue := s0_isFirstIssue s0_out.isHWPrefetch := s0_use_flow_prf s0_out.wlineflag := s0_wlineflag @@ -216,22 +245,15 @@ class StoreUnit(implicit p: Parameters) extends XSModule s0_out.elemIdx := s0_elemIdx s0_out.alignedType := s0_alignedType s0_out.mbIndex := s0_mBIndex + s0_out.misalignWith16Byte := s0_misalignWith16Byte + s0_out.isMisalign := s0_isMisalign s0_out.vecBaseVaddr := s0_vecBaseVaddr when(s0_valid && s0_isFirstIssue) { s0_out.uop.debugInfo.tlbFirstReqTime := GTimer() } s0_out.isFrmMisAlignBuf := s0_use_flow_ma - - // exception check - val s0_addr_aligned = LookupTree(Mux(s0_use_flow_vec, s0_vecstin.alignedType(1,0), s0_uop.fuOpType(1, 0)), List( - "b00".U -> true.B, //b - "b01".U -> (s0_out.vaddr(0) === 0.U), //h - "b10".U -> (s0_out.vaddr(1,0) === 0.U), //w - "b11".U -> (s0_out.vaddr(2,0) === 0.U) //d - )) - // if vector store sends 128-bit requests, its address must be 128-aligned - XSError(s0_use_flow_vec && s0_out.vaddr(3, 0) =/= 0.U && s0_vecstin.alignedType(2), "unit stride 128 bit element is not aligned!") - s0_out.uop.exceptionVec(storeAddrMisaligned) := Mux(s0_use_non_prf_flow, (!s0_addr_aligned || s0_vecstin.uop.exceptionVec(storeAddrMisaligned) && s0_vecActive), false.B) + s0_out.isFinalSplit := s0_isFinalSplit +// s0_out.uop.exceptionVec(storeAddrMisaligned) := Mux(s0_use_non_prf_flow, (!s0_addr_aligned || s0_vecstin.uop.exceptionVec(storeAddrMisaligned) && s0_vecActive), false.B) && !s0_misalignWith16Byte io.st_mask_out.valid := s0_use_flow_rs || s0_use_flow_vec io.st_mask_out.bits.mask := s0_out.mask @@ -255,6 +277,7 @@ class StoreUnit(implicit p: Parameters) extends XSModule val s1_fire = s1_valid && !s1_kill && s1_can_go val s1_vecActive = RegEnable(s0_out.vecActive, true.B, s0_fire) val s1_frm_mabuf = s1_in.isFrmMisAlignBuf + val s1_is128bit = s1_in.is128bit // mmio cbo decoder val s1_mmio_cbo = s1_in.uop.fuOpType === LSUOpType.cbo_clean || @@ -285,18 +308,40 @@ class StoreUnit(implicit p: Parameters) extends XSModule io.stld_nuke_query.bits.robIdx := s1_in.uop.robIdx io.stld_nuke_query.bits.paddr := s1_paddr io.stld_nuke_query.bits.mask := s1_in.mask - io.stld_nuke_query.bits.matchLine := s1_in.isvec && s1_in.is128bit + io.stld_nuke_query.bits.matchLine := (s1_in.isvec || s1_in.misalignWith16Byte) && s1_in.is128bit // issue io.issue.valid := s1_valid && !s1_tlb_miss && !s1_in.isHWPrefetch && !s1_isvec && !s1_frm_mabuf io.issue.bits := RegEnable(s0_stin, s0_valid) + // trigger + val storeTrigger = Module(new MemTrigger(MemType.STORE)) + storeTrigger.io.fromCsrTrigger.tdataVec := io.fromCsrTrigger.tdataVec + storeTrigger.io.fromCsrTrigger.tEnableVec := io.fromCsrTrigger.tEnableVec + storeTrigger.io.fromCsrTrigger.triggerCanRaiseBpExp := io.fromCsrTrigger.triggerCanRaiseBpExp + storeTrigger.io.fromCsrTrigger.debugMode := io.fromCsrTrigger.debugMode + storeTrigger.io.fromLoadStore.vaddr := s1_in.vaddr + storeTrigger.io.fromLoadStore.isVectorUnitStride := s1_in.isvec && s1_in.is128bit + storeTrigger.io.fromLoadStore.mask := s1_in.mask + + val s1_trigger_action = storeTrigger.io.toLoadStore.triggerAction + val s1_trigger_debug_mode = TriggerAction.isDmode(s1_trigger_action) + val s1_trigger_breakpoint = TriggerAction.isExp(s1_trigger_action) + + // goto misalignBuffer + val toMisalignBufferValid = s1_valid && !s1_tlb_miss && !s1_in.isHWPrefetch && GatedValidRegNext(io.csrCtrl.hd_misalign_st_enable) && s1_in.isMisalign && 
!s1_in.misalignWith16Byte && !s1_trigger_breakpoint && !s1_trigger_debug_mode + io.misalign_buf.valid := toMisalignBufferValid + io.misalign_buf.bits := io.lsq.bits + val misalignBufferNack = toMisalignBufferValid && !io.misalign_buf.ready + + // for misalign in vsMergeBuffer + io.s0_s1_valid := s0_valid || s1_valid // Send TLB feedback to store issue queue // Store feedback is generated in store_s1, sent to RS in store_s2 val s1_feedback = Wire(Valid(new RSFeedback)) s1_feedback.valid := s1_valid & !s1_in.isHWPrefetch - s1_feedback.bits.hit := !s1_tlb_miss + s1_feedback.bits.hit := !s1_tlb_miss && !misalignBufferNack s1_feedback.bits.flushState := io.tlb.resp.bits.ptwBack s1_feedback.bits.robIdx := s1_out.uop.robIdx s1_feedback.bits.sourceType := RSFeedbackType.tlbMiss @@ -326,7 +371,7 @@ class StoreUnit(implicit p: Parameters) extends XSModule s1_out.tlbMiss := s1_tlb_miss s1_out.atomic := s1_mmio || Pbmt.isIO(s1_pbmt) s1_out.isForVSnonLeafPTE := s1_isForVSnonLeafPTE - when (!s1_out.isvec && RegNext(io.tlb.req.bits.checkfullva) && + when (RegNext(io.tlb.req.bits.checkfullva) && (s1_out.uop.exceptionVec(storePageFault) || s1_out.uop.exceptionVec(storeAccessFault) || s1_out.uop.exceptionVec(storeGuestPageFault))) { @@ -336,23 +381,10 @@ class StoreUnit(implicit p: Parameters) extends XSModule s1_out.uop.exceptionVec(storeAccessFault) := io.tlb.resp.bits.excp(0).af.st && s1_vecActive s1_out.uop.exceptionVec(storeGuestPageFault) := io.tlb.resp.bits.excp(0).gpf.st && s1_vecActive - // trigger - val storeTrigger = Module(new MemTrigger(MemType.STORE)) - storeTrigger.io.fromCsrTrigger.tdataVec := io.fromCsrTrigger.tdataVec - storeTrigger.io.fromCsrTrigger.tEnableVec := io.fromCsrTrigger.tEnableVec - storeTrigger.io.fromCsrTrigger.triggerCanRaiseBpExp := io.fromCsrTrigger.triggerCanRaiseBpExp - storeTrigger.io.fromCsrTrigger.debugMode := io.fromCsrTrigger.debugMode - storeTrigger.io.fromLoadStore.vaddr := s1_in.vaddr - storeTrigger.io.fromLoadStore.isVectorUnitStride := s1_in.isvec && s1_in.is128bit - storeTrigger.io.fromLoadStore.mask := s1_in.mask - - val s1_trigger_action = storeTrigger.io.toLoadStore.triggerAction - val s1_trigger_debug_mode = TriggerAction.isDmode(s1_trigger_action) - val s1_trigger_breakpoint = TriggerAction.isExp(s1_trigger_action) - s1_out.uop.flushPipe := false.B s1_out.uop.trigger := s1_trigger_action s1_out.uop.exceptionVec(breakPoint) := s1_trigger_breakpoint + s1_out.uop.exceptionVec(storeAddrMisaligned) := s1_mmio && s1_in.isMisalign s1_out.vecVaddrOffset := Mux( s1_trigger_debug_mode || s1_trigger_breakpoint, storeTrigger.io.toLoadStore.triggerVaddr - s1_in.vecBaseVaddr, @@ -361,14 +393,12 @@ class StoreUnit(implicit p: Parameters) extends XSModule s1_out.vecTriggerMask := Mux(s1_trigger_debug_mode || s1_trigger_breakpoint, storeTrigger.io.toLoadStore.triggerMask, 0.U) // scalar store and scalar load nuke check, and also other purposes - io.lsq.valid := s1_valid && !s1_in.isHWPrefetch && !s1_frm_mabuf + //A 128-bit aligned unaligned memory access requires changing the unaligned flag bit in sq + io.lsq.valid := s1_valid && !s1_in.isHWPrefetch io.lsq.bits := s1_out io.lsq.bits.miss := s1_tlb_miss - - // goto misalignBuffer - io.misalign_buf.valid := s1_valid && !s1_tlb_miss && !s1_in.isHWPrefetch && GatedValidRegNext(io.csrCtrl.hd_misalign_st_enable) && !s1_in.isvec - io.misalign_buf.bits := io.lsq.bits - + io.lsq.bits.isFinalSplit := !s1_frm_mabuf || s1_exception + io.lsq.bits.updateAddrValid := (!s1_in.isMisalign || s1_in.misalignWith16Byte) && (!s1_frm_mabuf || 
s1_in.isFinalSplit) || s1_exception // kill dcache write intent request when tlb miss or exception io.dcache.s1_kill := (s1_tlb_miss || s1_exception || s1_mmio || s1_in.uop.robIdx.needFlush(io.redirect)) io.dcache.s1_paddr := s1_paddr @@ -395,8 +425,7 @@ class StoreUnit(implicit p: Parameters) extends XSModule val s2_frm_mabuf = s2_in.isFrmMisAlignBuf val s2_pbmt = RegEnable(s1_pbmt, s1_fire) val s2_trigger_debug_mode = RegEnable(s1_trigger_debug_mode, false.B, s1_fire) - val s2_mis_align = GatedValidRegNext(io.csrCtrl.hd_misalign_st_enable) && !s2_in.isvec && - s2_in.uop.exceptionVec(storeAddrMisaligned) && !s2_in.uop.exceptionVec(breakPoint) && !s2_trigger_debug_mode + val s2_mis_align = RegEnable(toMisalignBufferValid, s1_fire) s2_ready := !s2_valid || s2_kill || s3_ready when (s1_fire) { s2_valid := true.B } @@ -418,9 +447,10 @@ class StoreUnit(implicit p: Parameters) extends XSModule s2_out.atomic := s2_in.atomic || Pbmt.isPMA(s2_pbmt) && s2_pmp.atomic s2_out.uop.exceptionVec(storeAccessFault) := (s2_in.uop.exceptionVec(storeAccessFault) || s2_pmp.st || - (s2_in.isvec && s2_actually_uncache && RegNext(s1_feedback.bits.hit)) + ((s2_in.isvec || s2_frm_mabuf) && s2_actually_uncache && RegNext(s1_feedback.bits.hit)) ) && s2_vecActive - s2_out.uop.vpu.vstart := s2_in.vecVaddrOffset >> s2_in.uop.vpu.veew + s2_out.uop.exceptionVec(storeAddrMisaligned) := s2_mmio && s2_in.isMisalign + s2_out.uop.vpu.vstart := s2_in.vecVaddrOffset >> s2_in.uop.vpu.veew // kill dcache write intent request when mmio or exception io.dcache.s2_kill := (s2_uncache || s2_exception || s2_in.uop.robIdx.needFlush(io.redirect)) @@ -475,6 +505,11 @@ class StoreUnit(implicit p: Parameters) extends XSModule // TODO: add prefetch and access bit io.prefetch_train.bits.meta_prefetch := false.B io.prefetch_train.bits.meta_access := false.B + io.prefetch_train.bits.isFinalSplit := false.B + io.prefetch_train.bits.misalignWith16Byte := false.B + io.prefetch_train.bits.isMisalign := false.B + io.prefetch_train.bits.misalignNeedWakeUp := false.B + io.prefetch_train.bits.updateAddrValid := false.B // Pipeline // -------------------------------------------------------------------------------- diff --git a/src/main/scala/xiangshan/mem/vector/VMergeBuffer.scala b/src/main/scala/xiangshan/mem/vector/VMergeBuffer.scala index bbe85a3c27..aac9d69553 100644 --- a/src/main/scala/xiangshan/mem/vector/VMergeBuffer.scala +++ b/src/main/scala/xiangshan/mem/vector/VMergeBuffer.scala @@ -306,8 +306,8 @@ abstract class BaseVMergeBuffer(isVStore: Boolean=false)(implicit p: Parameters) needRSReplay(wbIndex) := true.B } pipewb.ready := true.B - XSError((entries(latchWbIndex).flowNum - latchFlowNum > entries(latchWbIndex).flowNum) && latchWbValid && !latchMergeByPre, "FlowWriteback overflow!!\n") - XSError(!allocated(latchWbIndex) && latchWbValid, "Writeback error flow!!\n") + XSError((entries(latchWbIndex).flowNum - latchFlowNum > entries(latchWbIndex).flowNum) && latchWbValid && !latchMergeByPre, s"entry: $latchWbIndex, FlowWriteback overflow!!\n") + XSError(!allocated(latchWbIndex) && latchWbValid, s"entry: $latchWbIndex, Writeback error flow!!\n") } // for inorder mem asscess io.toSplit := DontCare @@ -468,4 +468,9 @@ class VSMergeBufferImp(implicit p: Parameters) extends BaseVMergeBuffer(isVStore sink.uop.vpu.vstart := source.vstart sink } + + // from misalignBuffer flush + when(io.fromMisalignBuffer.get.flush){ + needRSReplay(io.fromMisalignBuffer.get.mbIndex) := true.B + } } diff --git a/src/main/scala/xiangshan/mem/vector/VSegmentUnit.scala 
b/src/main/scala/xiangshan/mem/vector/VSegmentUnit.scala index 5cee00e407..c9f75f2cc1 100644 --- a/src/main/scala/xiangshan/mem/vector/VSegmentUnit.scala +++ b/src/main/scala/xiangshan/mem/vector/VSegmentUnit.scala @@ -90,6 +90,7 @@ class VSegmentUnit (implicit p: Parameters) extends VLSUModule } } + val maxSplitNum = 2 /** ******************************************************************************************************** @@ -230,19 +231,37 @@ class VSegmentUnit (implicit p: Parameters) extends VLSUModule * s_pm: check pmp * s_cache_req: request cache * s_cache_resp: wait cache resp + * s_misalign_merge_data: merge unaligned data * s_latch_and_merge_data: for read data * s_send_data: for send write data * s_wait_to_sbuffer: Wait for data from the sbufferOut pipelayer to be sent to the sbuffer - * s_finish: + * s_finish: normal uop is complete * s_fof_fix_vl: Writeback the uop of the fof instruction to modify vl. * */ - val s_idle :: s_flush_sbuffer_req :: s_wait_flush_sbuffer_resp :: s_tlb_req :: s_wait_tlb_resp :: s_pm ::s_cache_req :: s_cache_resp :: s_latch_and_merge_data :: s_send_data :: s_wait_to_sbuffer :: s_finish :: s_fof_fix_vl :: Nil = Enum(13) + val s_idle :: s_flush_sbuffer_req :: s_wait_flush_sbuffer_resp :: s_tlb_req :: s_wait_tlb_resp :: s_pm ::s_cache_req :: s_cache_resp :: s_misalign_merge_data :: s_latch_and_merge_data :: s_send_data :: s_wait_to_sbuffer :: s_finish :: s_fof_fix_vl :: Nil = Enum(14) val state = RegInit(s_idle) val stateNext = WireInit(s_idle) val sbufferEmpty = io.flush_sbuffer.empty val isEnqfof = io.in.bits.uop.fuOpType === VlduType.vleff && io.in.valid val isEnqFixVlUop = isEnqfof && io.in.bits.uop.vpu.lastUop + // handle misalign sign + val curPtr = RegInit(false.B) + val canHandleMisalign = WireInit(false.B) + val isMisalignReg = RegInit(false.B) + val isMisalignWire = WireInit(false.B) + val notCross16ByteReg = RegInit(false.B) + val notCross16ByteWire = WireInit(false.B) + val combinedData = RegInit(0.U(XLEN.W)) + + val lowPagePaddr = RegInit(0.U(PAddrBits.W)) + val lowPageGPaddr = RegInit(0.U(GPAddrBits.W)) + + val highPagePaddr = RegInit(0.U(PAddrBits.W)) + val highPageGPaddr = RegInit(0.U(GPAddrBits.W)) + + val isFirstSplit = !curPtr + val isSecondSplit = curPtr /** * state update */ @@ -270,10 +289,16 @@ class VSegmentUnit (implicit p: Parameters) extends VLSUModule s_wait_tlb_resp) }.elsewhen(state === s_pm){ - /* if is vStore, send data to sbuffer, so don't need query dcache */ - stateNext := Mux(exception_pa || exception_va || exception_gpa, - s_finish, - Mux(isVSegLoad, s_cache_req, s_send_data)) + when(exception_pa || exception_va || exception_gpa) { + stateNext := s_finish + } .otherwise { + when(canHandleMisalign && isMisalignWire && !notCross16ByteWire || (isMisalignReg && !notCross16ByteReg && isFirstSplit && isVSegStore)) { + stateNext := s_tlb_req + } .otherwise { + /* if is vStore, send data to sbuffer, so don't need query dcache */ + stateNext := Mux(isVSegLoad, s_cache_req, s_send_data) + } + } }.elsewhen(state === s_cache_req){ stateNext := Mux(io.rdcache.req.fire, s_cache_resp, s_cache_req) @@ -283,12 +308,14 @@ class VSegmentUnit (implicit p: Parameters) extends VLSUModule when(io.rdcache.resp.bits.miss || io.rdcache.s2_bank_conflict) { stateNext := s_cache_req }.otherwise { - stateNext := Mux(isVSegLoad, s_latch_and_merge_data, s_send_data) + + stateNext := Mux(isVSegLoad, Mux(isMisalignReg && !notCross16ByteReg, s_misalign_merge_data, s_latch_and_merge_data), s_send_data) } }.otherwise{ stateNext := s_cache_resp } - /* 
if segment is inactive, don't need to wait access all of the field */ + }.elsewhen(state === s_misalign_merge_data) { + stateNext := Mux(!curPtr, s_tlb_req, s_latch_and_merge_data) }.elsewhen(state === s_latch_and_merge_data) { when((segmentIdx === maxSegIdx) && (fieldIdx === maxNfields) || ((segmentIdx === maxSegIdx) && !segmentActive)) { @@ -299,7 +326,7 @@ class VSegmentUnit (implicit p: Parameters) extends VLSUModule } /* if segment is inactive, don't need to wait access all of the field */ }.elsewhen(state === s_send_data) { // when sbuffer accept data - when(!sbufferOut.fire && segmentActive) { + when(!sbufferOut.fire && segmentActive || (isMisalignReg && !notCross16ByteReg && isFirstSplit)) { stateNext := s_send_data }.elsewhen(segmentIdx === maxSegIdx && (fieldIdx === maxNfields && sbufferOut.fire || !segmentActive && io.sbuffer.valid && !io.sbuffer.ready)) { stateNext := s_wait_to_sbuffer @@ -362,6 +389,8 @@ class VSegmentUnit (implicit p: Parameters) extends VLSUModule instMicroOp.isFof := (fuOpType === VlduType.vleff) && FuType.isVSegLoad(io.in.bits.uop.fuType) instMicroOp.isVSegLoad := FuType.isVSegLoad(io.in.bits.uop.fuType) instMicroOp.isVSegStore := FuType.isVSegStore(io.in.bits.uop.fuType) + isMisalignReg := false.B + notCross16ByteReg := false.B } // latch data when(io.in.fire && !isEnqFixVlUop){ @@ -389,8 +418,14 @@ class VSegmentUnit (implicit p: Parameters) extends VLSUModule segmentOffset) val vaddr = baseVaddr + (fieldIdx << alignedType).asUInt + realSegmentOffset + val misalignLowVaddr = Cat(latchVaddr(latchVaddr.getWidth - 1, 3), 0.U(3.W)) + val misalignHighVaddr = Cat(latchVaddr(latchVaddr.getWidth - 1, 3), 0.U(3.W)) + 8.U + val notCross16ByteVaddr = Cat(latchVaddr(latchVaddr.getWidth - 1, 4), 0.U(4.W)) +// val misalignVaddr = Mux(notCross16ByteReg, notCross16ByteVaddr, Mux(isFirstSplit, misalignLowVaddr, misalignHighVaddr)) + val misalignVaddr = Mux(isFirstSplit, misalignLowVaddr, misalignHighVaddr) + val tlbReqVaddr = Mux(isMisalignReg, misalignVaddr, vaddr) //latch vaddr - when(state === s_tlb_req){ + when(state === s_tlb_req && !isMisalignReg){ latchVaddr := vaddr(VAddrBits - 1, 0) } /** @@ -402,8 +437,8 @@ class VSegmentUnit (implicit p: Parameters) extends VLSUModule io.dtlb.resp.ready := true.B io.dtlb.req.valid := state === s_tlb_req && segmentActive io.dtlb.req.bits.cmd := Mux(isVSegLoad, TlbCmd.read, TlbCmd.write) - io.dtlb.req.bits.vaddr := vaddr(VAddrBits - 1, 0) - io.dtlb.req.bits.fullva := vaddr + io.dtlb.req.bits.vaddr := tlbReqVaddr(VAddrBits - 1, 0) + io.dtlb.req.bits.fullva := tlbReqVaddr io.dtlb.req.bits.checkfullva := true.B io.dtlb.req.bits.size := instMicroOp.alignedType(2,0) io.dtlb.req.bits.memidx.is_ld := isVSegLoad @@ -422,7 +457,7 @@ class VSegmentUnit (implicit p: Parameters) extends VLSUModule segmentTrigger.io.fromCsrTrigger.triggerCanRaiseBpExp := io.fromCsrTrigger.triggerCanRaiseBpExp segmentTrigger.io.fromCsrTrigger.debugMode := io.fromCsrTrigger.debugMode segmentTrigger.io.memType := isVSegLoad - segmentTrigger.io.fromLoadStore.vaddr := latchVaddr + segmentTrigger.io.fromLoadStore.vaddr := Mux(isMisalignReg, misalignVaddr, latchVaddr) segmentTrigger.io.fromLoadStore.isVectorUnitStride := false.B segmentTrigger.io.fromLoadStore.mask := 0.U @@ -443,6 +478,11 @@ class VSegmentUnit (implicit p: Parameters) extends VLSUModule instMicroOp.exceptionVaddr := io.dtlb.resp.bits.fullva instMicroOp.exceptionGpaddr := io.dtlb.resp.bits.gpaddr(0) instMicroOp.exceptionIsForVSnonLeafPTE := io.dtlb.resp.bits.isForVSnonLeafPTE + lowPagePaddr := 
Mux(isMisalignReg && !notCross16ByteReg && !curPtr, io.dtlb.resp.bits.paddr(0), lowPagePaddr) + lowPageGPaddr := Mux(isMisalignReg && !notCross16ByteReg && !curPtr, io.dtlb.resp.bits.gpaddr(0), lowPageGPaddr) + + highPagePaddr := Mux(isMisalignReg && !notCross16ByteReg && curPtr, io.dtlb.resp.bits.paddr(0), highPagePaddr) + highPageGPaddr := Mux(isMisalignReg && !notCross16ByteReg && curPtr, io.dtlb.resp.bits.gpaddr(0), highPageGPaddr) } } // pmp @@ -450,18 +490,29 @@ class VSegmentUnit (implicit p: Parameters) extends VLSUModule val exceptionWithPf = exceptionVec(storePageFault) || exceptionVec(loadPageFault) || exceptionVec(storeGuestPageFault) || exceptionVec(loadGuestPageFault) val pmp = (io.pmpResp.asUInt & Fill(io.pmpResp.asUInt.getWidth, !exceptionWithPf)).asTypeOf(new PMPRespBundle()) when(state === s_pm) { + val highAddress = LookupTree(Mux(isIndexed(issueInstType), issueSew(1, 0), issueEew(1, 0)), List( + "b00".U -> 0.U, + "b01".U -> 1.U, + "b10".U -> 3.U, + "b11".U -> 7.U + )) + tlbReqVaddr(4, 0) + val addr_aligned = LookupTree(Mux(isIndexed(issueInstType), issueSew(1, 0), issueEew(1, 0)), List( "b00".U -> true.B, //b - "b01".U -> (vaddr(0) === 0.U), //h - "b10".U -> (vaddr(1, 0) === 0.U), //w - "b11".U -> (vaddr(2, 0) === 0.U) //d + "b01".U -> (tlbReqVaddr(0) === 0.U), //h + "b10".U -> (tlbReqVaddr(1, 0) === 0.U), //w + "b11".U -> (tlbReqVaddr(2, 0) === 0.U) //d )) - val missAligned = !addr_aligned - exceptionVec(loadAddrMisaligned) := missAligned && isVSegLoad && canTriggerException - exceptionVec(storeAddrMisaligned) := missAligned && isVSegStore && canTriggerException + + notCross16ByteWire := highAddress(4) === tlbReqVaddr(4) + isMisalignWire := !addr_aligned + canHandleMisalign := !pmp.mmio && !triggerBreakpoint && !triggerDebugMode + exceptionVec(loadAddrMisaligned) := isMisalignWire && isVSegLoad && canTriggerException && !canHandleMisalign + exceptionVec(storeAddrMisaligned) := isMisalignWire && isVSegStore && canTriggerException && !canHandleMisalign + exception_va := exceptionVec(storePageFault) || exceptionVec(loadPageFault) || exceptionVec(storeAccessFault) || exceptionVec(loadAccessFault) || - triggerBreakpoint || triggerDebugMode || missAligned + triggerBreakpoint || triggerDebugMode || (isMisalignWire && !canHandleMisalign) exception_gpa := exceptionVec(storeGuestPageFault) || exceptionVec(loadGuestPageFault) exception_pa := pmp.st || pmp.ld || pmp.mmio @@ -490,6 +541,12 @@ class VSegmentUnit (implicit p: Parameters) extends VLSUModule when(exceptionVec(breakPoint) || triggerDebugMode) { instMicroOp.uop.trigger := triggerAction } + + when(isMisalignWire && canHandleMisalign && !(exception_va || exception_gpa || exception_pa)) { + notCross16ByteReg := notCross16ByteWire + isMisalignReg := true.B + curPtr := false.B + } } /** @@ -497,6 +554,30 @@ class VSegmentUnit (implicit p: Parameters) extends VLSUModule */ io.flush_sbuffer.valid := !sbufferEmpty && (state === s_flush_sbuffer_req) + /** + * update curPtr + * */ + when(state === s_finish || state === s_latch_and_merge_data || state === s_send_data && stateNext =/= s_send_data) { + isMisalignReg := false.B + notCross16ByteReg := false.B + curPtr := false.B + } .otherwise { + when(isVSegLoad) { + when(isMisalignReg && !notCross16ByteReg && state === s_misalign_merge_data) { + curPtr := true.B + } + } .otherwise { + when(isMisalignReg && !notCross16ByteReg && state === s_pm) { + curPtr := !curPtr + } .elsewhen(isMisalignReg && !notCross16ByteReg && state === s_pm && stateNext === s_send_data) { + curPtr := false.B 
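The segment unit handles a misaligned element that crosses a 16-byte boundary by walking the TLB and PMP twice, once per 8-byte half, with curPtr selecting the half (the first pass latches lowPagePaddr, the second highPagePaddr). A minimal sketch of the request-address selection, using hypothetical names outside the actual module:

    import chisel3._
    import chisel3.util._

    // Illustrative sketch only: a cross-16-byte segment element is re-issued as
    // two 8-byte-aligned accesses; curPtr picks which half goes to the TLB/dcache
    // in the current pass.
    object SegMisalignAddr {
      def apply(elemVaddr: UInt, curPtr: Bool): UInt = {
        val lowVaddr  = Cat(elemVaddr(elemVaddr.getWidth - 1, 3), 0.U(3.W)) // 8-byte-aligned base
        val highVaddr = lowVaddr + 8.U                                      // following 8-byte beat
        Mux(curPtr, highVaddr, lowVaddr)
      }
    }

Loads then stitch the two beats back together in s_misalign_merge_data, while stores send one sbuffer write per beat in s_send_data; an element that is misaligned but stays inside one 16-byte line (notCross16ByteReg) skips the second pass and is issued as a single 128-bit access instead.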
+ } .elsewhen(isMisalignReg && !notCross16ByteReg && state === s_send_data && stateNext === s_send_data && sbufferOut.fire) { + curPtr := !curPtr + } + } + } + + /** * merge data for load @@ -519,7 +600,46 @@ class VSegmentUnit (implicit p: Parameters) extends VLSUModule "b1110".U -> io.rdcache.resp.bits.data_delayed(127, 112), "b1111".U -> io.rdcache.resp.bits.data_delayed(127, 120) )) - val pickData = rdataVecHelper(alignedType(1,0), cacheData) + + val misalignLowData = LookupTree(latchVaddr(3,0), List( + "b1001".U -> io.rdcache.resp.bits.data_delayed(127, 72), + "b1010".U -> io.rdcache.resp.bits.data_delayed(127, 80), + "b1011".U -> io.rdcache.resp.bits.data_delayed(127, 88), + "b1100".U -> io.rdcache.resp.bits.data_delayed(127, 96), + "b1101".U -> io.rdcache.resp.bits.data_delayed(127, 104), + "b1110".U -> io.rdcache.resp.bits.data_delayed(127, 112), + "b1111".U -> io.rdcache.resp.bits.data_delayed(127, 120) + )) + val misalignHighData = LookupTree(latchVaddr(3,0), List( + "b1001".U -> io.rdcache.resp.bits.data_delayed(63, 8), + "b1010".U -> io.rdcache.resp.bits.data_delayed(63, 16), + "b1011".U -> io.rdcache.resp.bits.data_delayed(63, 24), + "b1100".U -> io.rdcache.resp.bits.data_delayed(63, 32), + "b1101".U -> io.rdcache.resp.bits.data_delayed(63, 40), + "b1110".U -> io.rdcache.resp.bits.data_delayed(63, 48), + "b1111".U -> io.rdcache.resp.bits.data_delayed(63, 56) + )) + + val misalignCombinedData = LookupTree(latchVaddr(3,0), List( + "b1001".U -> Cat(io.rdcache.resp.bits.data_delayed, combinedData(55, 0))(63, 0), + "b1010".U -> Cat(io.rdcache.resp.bits.data_delayed, combinedData(47, 0))(63, 0), + "b1011".U -> Cat(io.rdcache.resp.bits.data_delayed, combinedData(39, 0))(63, 0), + "b1100".U -> Cat(io.rdcache.resp.bits.data_delayed, combinedData(31, 0))(63, 0), + "b1101".U -> Cat(io.rdcache.resp.bits.data_delayed, combinedData(23, 0))(63, 0), + "b1110".U -> Cat(io.rdcache.resp.bits.data_delayed, combinedData(15, 0))(63, 0), + "b1111".U -> Cat(io.rdcache.resp.bits.data_delayed, combinedData(7, 0))(63, 0) + )) + when(state === s_misalign_merge_data && segmentActive){ + when(!curPtr) { + combinedData := misalignLowData + } .otherwise { + combinedData := misalignCombinedData + } + } + + val shiftData = (io.rdcache.resp.bits.data_delayed >> (latchVaddr(3, 0) << 3)).asUInt(63, 0) + val mergemisalignData = Mux(notCross16ByteReg, shiftData, combinedData) + val pickData = rdataVecHelper(alignedType(1,0), Mux(isMisalignReg, mergemisalignData, cacheData)) val mergedData = mergeDataWithElemIdx( oldData = data(splitPtr.value), newData = Seq(pickData), @@ -530,6 +650,8 @@ class VSegmentUnit (implicit p: Parameters) extends VLSUModule when(state === s_latch_and_merge_data && segmentActive){ data(splitPtr.value) := mergedData } + + /** * split data for store * */ @@ -540,21 +662,23 @@ class VSegmentUnit (implicit p: Parameters) extends VLSUModule ) val flowData = genVWdata(splitData, alignedType) // TODO: connect vstd, pass vector data val wmask = genVWmask(latchVaddr, alignedType(1, 0)) & Fill(VLENB, segmentActive) - + val bmask = genBasemask(latchVaddr, alignedType(1, 0)) & Fill(VLENB, segmentActive) + val dcacheReqVaddr = Mux(isMisalignReg, misalignVaddr, latchVaddr) + val dcacheReqPaddr = Mux(isMisalignReg, Cat(instMicroOp.paddr(instMicroOp.paddr.getWidth - 1, PageOffsetWidth), misalignVaddr(PageOffsetWidth - 1, 0)), instMicroOp.paddr) /** * rdcache req, write request don't need to query dcache, because we write element to sbuffer */ io.rdcache.req := DontCare io.rdcache.req.valid := state === 
s_cache_req && isVSegLoad io.rdcache.req.bits.cmd := MemoryOpConstants.M_XRD - io.rdcache.req.bits.vaddr := latchVaddr + io.rdcache.req.bits.vaddr := dcacheReqVaddr io.rdcache.req.bits.mask := mask io.rdcache.req.bits.data := flowData io.rdcache.pf_source := LOAD_SOURCE.U io.rdcache.req.bits.id := DontCare io.rdcache.resp.ready := true.B - io.rdcache.s1_paddr_dup_lsu := instMicroOp.paddr - io.rdcache.s1_paddr_dup_dcache := instMicroOp.paddr + io.rdcache.s1_paddr_dup_lsu := dcacheReqPaddr + io.rdcache.s1_paddr_dup_dcache := dcacheReqPaddr io.rdcache.s1_kill := false.B io.rdcache.s1_kill_data_read := false.B io.rdcache.s2_kill := false.B @@ -568,21 +692,56 @@ class VSegmentUnit (implicit p: Parameters) extends VLSUModule io.rdcache.s2_pc := instMicroOp.uop.pc } io.rdcache.replacementUpdated := false.B - io.rdcache.is128Req := false.B + io.rdcache.is128Req := notCross16ByteReg /** * write data to sbuffer * */ + val sbufferAddrLow4bit = latchVaddr(3, 0) + + val notCross16BytePaddr = Cat(instMicroOp.paddr(instMicroOp.paddr.getWidth - 1, 4), 0.U(4.W)) + val notCross16ByteData = flowData << (sbufferAddrLow4bit << 3) + + val Cross16ByteMask = Wire(UInt(32.W)) + val Cross16ByteData = Wire(UInt(256.W)) + Cross16ByteMask := bmask << sbufferAddrLow4bit + Cross16ByteData := flowData << (sbufferAddrLow4bit << 3) + + val vaddrLow = Cat(latchVaddr(latchVaddr.getWidth - 1, 3), 0.U(3.W)) + val vaddrHigh = Cat(latchVaddr(latchVaddr.getWidth - 1, 3), 0.U(3.W)) + 8.U + + + val paddrLow = Cat(lowPagePaddr(lowPagePaddr.getWidth - 1, 3), 0.U(3.W)) + val paddrHigh = Cat(instMicroOp.paddr(instMicroOp.paddr.getWidth - 1, 3), 0.U(3.W)) + + val maskLow = Cross16ByteMask(15, 0) + val maskHigh = Cross16ByteMask(31, 16) + + val dataLow = Cross16ByteData(127, 0) + val dataHigh = Cross16ByteData(255, 128) + + val sbuffermisalignMask = Mux(notCross16ByteReg, wmask, Mux(isFirstSplit, maskLow, maskHigh)) + val sbuffermisalignData = Mux(notCross16ByteReg, notCross16ByteData, Mux(isFirstSplit, dataLow, dataHigh)) + val sbuffermisalignPaddr = Mux(notCross16ByteReg, notCross16BytePaddr, Mux(isFirstSplit, paddrLow, paddrHigh)) + val sbuffermisalignVaddr = Mux(notCross16ByteReg, notCross16ByteVaddr, Mux(isFirstSplit, vaddrLow, vaddrHigh)) + + val sbufferMask = Mux(isMisalignReg, sbuffermisalignMask, wmask) + val sbufferData = Mux(isMisalignReg, sbuffermisalignData, flowData) + val sbufferVaddr = Mux(isMisalignReg, sbuffermisalignVaddr, latchVaddr) + val sbufferPaddr = Mux(isMisalignReg, sbuffermisalignPaddr, instMicroOp.paddr) + + dontTouch(wmask) + dontTouch(Cross16ByteMask) sbufferOut.bits := DontCare sbufferOut.valid := state === s_send_data && segmentActive sbufferOut.bits.vecValid := state === s_send_data && segmentActive - sbufferOut.bits.mask := wmask - sbufferOut.bits.data := flowData - sbufferOut.bits.vaddr := latchVaddr + sbufferOut.bits.mask := sbufferMask + sbufferOut.bits.data := sbufferData + sbufferOut.bits.vaddr := sbufferVaddr sbufferOut.bits.cmd := MemoryOpConstants.M_XWR sbufferOut.bits.id := DontCare - sbufferOut.bits.addr := instMicroOp.paddr + sbufferOut.bits.addr := sbufferPaddr NewPipelineConnect( sbufferOut, io.sbuffer, io.sbuffer.fire, @@ -599,7 +758,7 @@ class VSegmentUnit (implicit p: Parameters) extends VLSUModule private val fieldActiveWirteFinish = sbufferOut.fire && segmentActive // writedata finish and is a active segment XSError(sbufferOut.fire && !segmentActive, "Attempt write inactive segment to sbuffer, something wrong!\n") - private val segmentInactiveFinish = ((state === 
s_latch_and_merge_data) || (state === s_send_data)) && !segmentActive + private val segmentInactiveFinish = ((state === s_latch_and_merge_data) || (state === s_send_data && stateNext =/= s_send_data)) && !segmentActive val splitPtrOffset = Mux( isIndexed(instType), @@ -623,7 +782,7 @@ class VSegmentUnit (implicit p: Parameters) extends VLSUModule } // update splitPtr - when(state === s_latch_and_merge_data || (state === s_send_data && (fieldActiveWirteFinish || !segmentActive))){ + when(state === s_latch_and_merge_data || (state === s_send_data && stateNext =/= s_send_data && (fieldActiveWirteFinish || !segmentActive))){ splitPtr := splitPtrNext }.elsewhen(io.in.fire && !instMicroOpValid){ splitPtr := deqPtr // initial splitPtr @@ -637,7 +796,7 @@ class VSegmentUnit (implicit p: Parameters) extends VLSUModule when(io.in.fire && !instMicroOpValid){ // init fieldIdx := 0.U }.elsewhen(state === s_latch_and_merge_data && segmentActive || - (state === s_send_data && fieldActiveWirteFinish)){ // only if segment is active + (state === s_send_data && stateNext =/= s_send_data && fieldActiveWirteFinish)){ // only if segment is active /* next segment, only if segment complete */ fieldIdx := Mux(fieldIdx === maxNfields, 0.U, fieldIdx + 1.U) @@ -647,7 +806,7 @@ class VSegmentUnit (implicit p: Parameters) extends VLSUModule //update segmentIdx when(io.in.fire && !instMicroOpValid){ segmentIdx := 0.U - }.elsewhen(fieldIdx === maxNfields && (state === s_latch_and_merge_data || (state === s_send_data && fieldActiveWirteFinish)) && + }.elsewhen(fieldIdx === maxNfields && (state === s_latch_and_merge_data || (state === s_send_data && stateNext =/= s_send_data && fieldActiveWirteFinish)) && segmentIdx =/= maxSegIdx){ // next segment, only if segment is active segmentIdx := segmentIdx + 1.U @@ -657,7 +816,7 @@ class VSegmentUnit (implicit p: Parameters) extends VLSUModule //update segmentOffset /* when segment is active or segment is inactive, increase segmentOffset */ - when((fieldIdx === maxNfields && (state === s_latch_and_merge_data || (state === s_send_data && fieldActiveWirteFinish))) || + when((fieldIdx === maxNfields && (state === s_latch_and_merge_data || (state === s_send_data && stateNext =/= s_send_data && fieldActiveWirteFinish))) || segmentInactiveFinish){ segmentOffset := segmentOffset + Mux(isUnitStride(issueInstType), (maxNfields +& 1.U) << issueEew(1, 0), stride(stridePtr.value)) diff --git a/src/main/scala/xiangshan/mem/vector/VSplit.scala b/src/main/scala/xiangshan/mem/vector/VSplit.scala index 4d973d7705..a5222cb38e 100644 --- a/src/main/scala/xiangshan/mem/vector/VSplit.scala +++ b/src/main/scala/xiangshan/mem/vector/VSplit.scala @@ -359,12 +359,13 @@ abstract class VSplitBuffer(isVStore: Boolean = false)(implicit p: Parameters) e val regOffset = getCheckAddrLowBits(issueUsLowBitsAddr, maxMemByteNum) // offset in 256-bits vd XSError((splitIdx > 1.U && usNoSplit) || (splitIdx > 1.U && !issuePreIsSplit) , "Unit-Stride addr split error!\n") + // no-unit-stride can trigger misalign val addrAligned = LookupTree(issueEew, List( - "b00".U -> true.B, //b - "b01".U -> (issueBaseAddr(0) === 0.U), //h - "b10".U -> (issueBaseAddr(1, 0) === 0.U), //w - "b11".U -> (issueBaseAddr(2, 0) === 0.U) //d - )) + "b00".U -> true.B, //b + "b01".U -> (vaddr(0) === 0.U), //h + "b10".U -> (vaddr(1, 0) === 0.U), //w + "b11".U -> (vaddr(2, 0) === 0.U) //d + )) || !issuePreIsSplit // data io.out.bits match { case x => @@ -392,7 +393,9 @@ abstract class VSplitBuffer(isVStore: Boolean = false)(implicit p: Parameters) e 
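In the split buffer, addrAligned is now computed on the per-element vaddr and is treated as satisfied for flows with !issuePreIsSplit; a misaligned element is then held at the issue stage until it is safe to send, which the allowIssue logic in the hunk right below expresses as addrAligned || misalignedCanGo. A small sketch of that gate for the store side, reusing the storePipeEmpty and storeMisalignBufferEmpty signals of the new storeMisaignIO bundle (the surrounding wiring is assumed):

    import chisel3._

    // Illustrative sketch only: issue gating for a (possibly misaligned)
    // vector-store element leaving the split buffer.
    class MisalignIssueGate extends Module {
      val io = IO(new Bundle {
        val addrAligned              = Input(Bool()) // per-element check shown above
        val outReady                 = Input(Bool()) // downstream pipeline ready
        val storePipeEmpty           = Input(Bool()) // from vstdMisalign
        val storeMisalignBufferEmpty = Input(Bool()) // from vstdMisalign
        val allowIssue               = Output(Bool())
      })
      // a misaligned store element is only released once no older store is still
      // in flight, so the store misalign buffer can take it cleanly (assumed rationale)
      val misalignedCanGo = io.storePipeEmpty && io.storeMisalignBufferEmpty
      io.allowIssue := (io.addrAligned || misalignedCanGo) && io.outReady
    }

For the load-side split buffer no override is provided, so misalignedCanGo stays true.B as defined in the base class.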
/* Execute logic */ /** Issue to scala pipeline**/ - val allowIssue = io.out.ready + + lazy val misalignedCanGo = true.B + val allowIssue = (addrAligned || misalignedCanGo) && io.out.ready val issueCount = Mux(usNoSplit, 2.U, (PopCount(inActiveIssue) + PopCount(activeIssue))) // for dont need split unit-stride, issue two flow splitFinish := splitIdx >= (issueFlowNum - issueCount) @@ -427,7 +430,7 @@ abstract class VSplitBuffer(isVStore: Boolean = false)(implicit p: Parameters) e } // out connect - io.out.valid := issueValid && (vecActive || !issuePreIsSplit) // TODO: inactive unit-stride uop do not send to pipeline + io.out.valid := issueValid && (vecActive || !issuePreIsSplit) && (addrAligned || misalignedCanGo) // TODO: inactive unit-stride uop do not send to pipeline XSPerfAccumulate("out_valid", io.out.valid) XSPerfAccumulate("out_fire", io.out.fire) @@ -437,6 +440,8 @@ abstract class VSplitBuffer(isVStore: Boolean = false)(implicit p: Parameters) e } class VSSplitBufferImp(implicit p: Parameters) extends VSplitBuffer(isVStore = true){ + override lazy val misalignedCanGo = io.vstdMisalign.get.storePipeEmpty && io.vstdMisalign.get.storeMisalignBufferEmpty + // split data val splitData = genVSData( data = issueEntry.data.asUInt, @@ -525,5 +530,7 @@ class VSSplitImp(implicit p: Parameters) extends VLSUModule{ splitBuffer.io.redirect <> io.redirect io.out <> splitBuffer.io.out io.vstd.get <> splitBuffer.io.vstd.get + + io.vstdMisalign.get <> splitBuffer.io.vstdMisalign.get } diff --git a/src/main/scala/xiangshan/mem/vector/VecBundle.scala b/src/main/scala/xiangshan/mem/vector/VecBundle.scala index d491cbae55..168ea657ec 100644 --- a/src/main/scala/xiangshan/mem/vector/VecBundle.scala +++ b/src/main/scala/xiangshan/mem/vector/VecBundle.scala @@ -218,12 +218,18 @@ class FeedbackToLsqIO(implicit p: Parameters) extends VLSUBundle{ def isLast = feedback(VecFeedbacks.LAST) } +class storeMisaignIO(implicit p: Parameters) extends Bundle{ + val storePipeEmpty = Input(Bool()) + val storeMisalignBufferEmpty = Input(Bool()) +} + class VSplitIO(isVStore: Boolean=false)(implicit p: Parameters) extends VLSUBundle{ val redirect = Flipped(ValidIO(new Redirect)) val in = Flipped(Decoupled(new MemExuInput(isVector = true))) // from iq val toMergeBuffer = new ToMergeBufferIO(isVStore) //to merge buffer req mergebuffer entry val out = Decoupled(new VecPipeBundle(isVStore))// to scala pipeline val vstd = OptionWrapper(isVStore, Valid(new MemExuOutput(isVector = true))) + val vstdMisalign = OptionWrapper(isVStore, new storeMisaignIO) } class VSplitPipelineIO(isVStore: Boolean=false)(implicit p: Parameters) extends VLSUBundle{ @@ -238,6 +244,7 @@ class VSplitBufferIO(isVStore: Boolean=false)(implicit p: Parameters) extends VL val in = Flipped(Decoupled(new VLSBundle())) val out = Decoupled(new VecPipeBundle(isVStore))//to scala pipeline val vstd = OptionWrapper(isVStore, ValidIO(new MemExuOutput(isVector = true))) + val vstdMisalign = OptionWrapper(isVStore, new storeMisaignIO) } class VMergeBufferIO(isVStore : Boolean=false)(implicit p: Parameters) extends VLSUBundle{ @@ -248,6 +255,8 @@ class VMergeBufferIO(isVStore : Boolean=false)(implicit p: Parameters) extends V val toSplit = if(isVStore) Vec(VecStorePipelineWidth, ValidIO(new FeedbackToSplitIO)) else Vec(VecLoadPipelineWidth, ValidIO(new FeedbackToSplitIO)) // for inorder inst val toLsq = if(isVStore) Vec(VSUopWritebackWidth, ValidIO(new FeedbackToLsqIO)) else Vec(VLUopWritebackWidth, ValidIO(new FeedbackToLsqIO)) // for lsq deq val feedback = if(isVStore) 
Vec(VSUopWritebackWidth, ValidIO(new RSFeedback(isVector = true))) else Vec(VLUopWritebackWidth, ValidIO(new RSFeedback(isVector = true)))//for rs replay + + val fromMisalignBuffer = OptionWrapper(isVStore, Flipped(new StoreMaBufToVecStoreMergeBufferIO)) } class VSegmentUnitIO(implicit p: Parameters) extends VLSUBundle{ From da6af8185fc2d8c3341c79f50cd134f32d0f1bc7 Mon Sep 17 00:00:00 2001 From: Anzooooo Date: Fri, 29 Nov 2024 13:20:41 +0800 Subject: [PATCH 3/4] fix(Sbuffer): remove address assert for 'io.dcache.req' This is a false assertion, and we should not prohibit 0 address requests --- src/main/scala/xiangshan/mem/sbuffer/Sbuffer.scala | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/main/scala/xiangshan/mem/sbuffer/Sbuffer.scala b/src/main/scala/xiangshan/mem/sbuffer/Sbuffer.scala index 31dbabdf6b..bd18c9627c 100644 --- a/src/main/scala/xiangshan/mem/sbuffer/Sbuffer.scala +++ b/src/main/scala/xiangshan/mem/sbuffer/Sbuffer.scala @@ -685,11 +685,6 @@ class Sbuffer(implicit p: Parameters) io.dcache.req.bits.mask := mask(sbuffer_out_s1_evictionIdx).asUInt io.dcache.req.bits.id := sbuffer_out_s1_evictionIdx - when (sbuffer_out_s1_fire) { - assert(!(io.dcache.req.bits.vaddr === 0.U)) - assert(!(io.dcache.req.bits.addr === 0.U)) - } - XSDebug(sbuffer_out_s1_fire, p"send buf [$sbuffer_out_s1_evictionIdx] to Dcache, req fire\n" ) From 80147fa58493b44d5f602217d45fd3e97f137799 Mon Sep 17 00:00:00 2001 From: Anzooooo Date: Sun, 1 Dec 2024 15:55:38 +0800 Subject: [PATCH 4/4] fix(LoadQueueReplay): use more precise conditions for unblocking `forwarding fault` It is not necessary to check whether the storequeue entry pointed to by sqidx is complete, because this entry is the store that follows this load. --- src/main/scala/xiangshan/mem/lsqueue/LoadQueueReplay.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/xiangshan/mem/lsqueue/LoadQueueReplay.scala b/src/main/scala/xiangshan/mem/lsqueue/LoadQueueReplay.scala index f81ab9b0ae..9eab647d8d 100644 --- a/src/main/scala/xiangshan/mem/lsqueue/LoadQueueReplay.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/LoadQueueReplay.scala @@ -305,8 +305,8 @@ class LoadQueueReplay(implicit p: Parameters) extends XSModule for (i <- 0 until LoadQueueReplaySize) { // dequeue // FIXME: store*Ptr is not accurate - dataNotBlockVec(i) := isAfter(io.stDataReadySqPtr, blockSqIdx(i)) || stDataReadyVec(blockSqIdx(i).value) || io.sqEmpty // for better timing - addrNotBlockVec(i) := isAfter(io.stAddrReadySqPtr, blockSqIdx(i)) || !strict(i) && stAddrReadyVec(blockSqIdx(i).value) || io.sqEmpty // for better timing + dataNotBlockVec(i) := isNotBefore(io.stDataReadySqPtr, blockSqIdx(i)) || stDataReadyVec(blockSqIdx(i).value) || io.sqEmpty // for better timing + addrNotBlockVec(i) := isNotBefore(io.stAddrReadySqPtr, blockSqIdx(i)) || !strict(i) && stAddrReadyVec(blockSqIdx(i).value) || io.sqEmpty // for better timing // store address execute storeAddrInSameCycleVec(i) := VecInit((0 until StorePipelineWidth).map(w => { io.storeAddrIn(w).valid &&
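The final hunk (truncated above) is the PATCH 4/4 change: the unblock tests for dataNotBlockVec and addrNotBlockVec move from isAfter to isNotBefore on the store-queue pointers. The real comparison helpers live in XiangShan's utility package and are not shown in this series; a standalone approximation of the two predicates on a wrap-flag pointer, with hypothetical types, is:

    import chisel3._

    // Illustrative sketch only: circular-queue pointer comparison with a wrap flag.
    class SqPtrLike(indexWidth: Int) extends Bundle {
      val flag  = Bool()            // toggles on each wrap around the queue
      val value = UInt(indexWidth.W)
    }

    object SqPtrCompare {
      // a is strictly ahead of b (has moved past it)
      def isAfter(a: SqPtrLike, b: SqPtrLike): Bool =
        Mux(a.flag === b.flag, a.value > b.value, a.value < b.value)
      // a has at least reached b
      def isNotBefore(a: SqPtrLike, b: SqPtrLike): Bool =
        isAfter(a, b) || (a.flag === b.flag && a.value === b.value)
    }

With isAfter, a replaying load stayed blocked until the store address/data ready pointer had moved strictly past blockSqIdx, i.e. until that very entry was complete. isNotBefore releases the load as soon as the pointer reaches blockSqIdx, which is enough because, as the commit message explains, the entry at blockSqIdx is a store younger than the load, so the load only has to wait for the stores ahead of it.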