diff --git a/src/main/scala/xiangshan/Bundle.scala b/src/main/scala/xiangshan/Bundle.scala index fa473424a2..4ec7fee508 100644 --- a/src/main/scala/xiangshan/Bundle.scala +++ b/src/main/scala/xiangshan/Bundle.scala @@ -351,9 +351,12 @@ class ResetPregStateReq(implicit p: Parameters) extends XSBundle { class DebugBundle(implicit p: Parameters) extends XSBundle { val isMMIO = Bool() + val isNC = Bool() val isPerfCnt = Bool() val paddr = UInt(PAddrBits.W) val vaddr = UInt(VAddrBits.W) + + def isSkipDiff: Bool = isMMIO || isNC || isPerfCnt /* add L/S inst info in EXU */ // val L1toL2TlbLatency = UInt(XLEN.W) // val levelTlbHit = UInt(2.W) diff --git a/src/main/scala/xiangshan/XSCore.scala b/src/main/scala/xiangshan/XSCore.scala index b52ba4bcf5..5d99e3598b 100644 --- a/src/main/scala/xiangshan/XSCore.scala +++ b/src/main/scala/xiangshan/XSCore.scala @@ -208,7 +208,7 @@ class XSCoreImp(outer: XSCoreBase) extends LazyModuleImp(outer) memBlock.io.ooo_to_mem.tlbCsr := backend.io.mem.tlbCsr memBlock.io.ooo_to_mem.lsqio.lcommit := backend.io.mem.robLsqIO.lcommit memBlock.io.ooo_to_mem.lsqio.scommit := backend.io.mem.robLsqIO.scommit - memBlock.io.ooo_to_mem.lsqio.pendingUncacheld := backend.io.mem.robLsqIO.pendingUncacheld + memBlock.io.ooo_to_mem.lsqio.pendingMMIOld := backend.io.mem.robLsqIO.pendingMMIOld memBlock.io.ooo_to_mem.lsqio.pendingld := backend.io.mem.robLsqIO.pendingld memBlock.io.ooo_to_mem.lsqio.pendingst := backend.io.mem.robLsqIO.pendingst memBlock.io.ooo_to_mem.lsqio.pendingVst := backend.io.mem.robLsqIO.pendingVst diff --git a/src/main/scala/xiangshan/backend/MemBlock.scala b/src/main/scala/xiangshan/backend/MemBlock.scala index 0dde7f92c8..67720b0c51 100644 --- a/src/main/scala/xiangshan/backend/MemBlock.scala +++ b/src/main/scala/xiangshan/backend/MemBlock.scala @@ -66,6 +66,7 @@ trait HasMemBlockParameters extends HasXSParameter { val AtomicWBPort = 0 val MisalignWBPort = 1 val UncacheWBPort = 2 + val NCWBPorts = Seq(1, 2) } abstract class MemBlockBundle(implicit val p: Parameters) extends Bundle with HasMemBlockParameters @@ -89,7 +90,7 @@ class ooo_to_mem(implicit p: Parameters) extends MemBlockBundle { val lsqio = new Bundle { val lcommit = Input(UInt(log2Up(CommitWidth + 1).W)) val scommit = Input(UInt(log2Up(CommitWidth + 1).W)) - val pendingUncacheld = Input(Bool()) + val pendingMMIOld = Input(Bool()) val pendingld = Input(Bool()) val pendingst = Input(Bool()) val pendingVst = Input(Bool()) @@ -820,6 +821,7 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer) // forward loadUnits(i).io.lsq.forward <> lsq.io.forward(i) loadUnits(i).io.sbuffer <> sbuffer.io.forward(i) + loadUnits(i).io.ubuffer <> uncache.io.forward(i) loadUnits(i).io.tl_d_channel := dcache.io.lsu.forward_D(i) loadUnits(i).io.forward_mshr <> dcache.io.lsu.forward_mshr(i) // ld-ld violation check @@ -905,6 +907,7 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer) loadUnits(i).io.lsq.uncache.bits := DontCare } lsq.io.ld_raw_data(i) <> loadUnits(i).io.lsq.ld_raw_data + lsq.io.ncOut(i) <> loadUnits(i).io.lsq.nc_ldin lsq.io.l2_hint.valid := l2_hint.valid lsq.io.l2_hint.bits.sourceId := l2_hint.bits.sourceId lsq.io.l2_hint.bits.isKeyword := l2_hint.bits.isKeyword @@ -962,6 +965,7 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer) hybridUnits(i).io.ldu_io.lsq.forward <> lsq.io.forward(LduCnt + i) // forward hybridUnits(i).io.ldu_io.sbuffer <> sbuffer.io.forward(LduCnt + i) + hybridUnits(i).io.ldu_io.ubuffer <> 
uncache.io.forward(LduCnt + i) // hybridUnits(i).io.ldu_io.vec_forward <> vsFlowQueue.io.forward(LduCnt + i) hybridUnits(i).io.ldu_io.vec_forward := DontCare hybridUnits(i).io.ldu_io.tl_d_channel := dcache.io.lsu.forward_D(LduCnt + i) @@ -1035,6 +1039,8 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer) // passdown to lsq (load s2) + hybridUnits(i).io.ldu_io.lsq.nc_ldin.valid := false.B + hybridUnits(i).io.ldu_io.lsq.nc_ldin.bits := DontCare lsq.io.ldu.ldin(LduCnt + i) <> hybridUnits(i).io.ldu_io.lsq.ldin // Lsq to sta unit lsq.io.sta.storeMaskIn(StaCnt + i) <> hybridUnits(i).io.stu_io.st_mask_out @@ -1074,7 +1080,7 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer) loadMisalignBuffer.io.redirect <> redirect loadMisalignBuffer.io.rob.lcommit := io.ooo_to_mem.lsqio.lcommit loadMisalignBuffer.io.rob.scommit := io.ooo_to_mem.lsqio.scommit - loadMisalignBuffer.io.rob.pendingUncacheld := io.ooo_to_mem.lsqio.pendingUncacheld + loadMisalignBuffer.io.rob.pendingMMIOld := io.ooo_to_mem.lsqio.pendingMMIOld loadMisalignBuffer.io.rob.pendingld := io.ooo_to_mem.lsqio.pendingld loadMisalignBuffer.io.rob.pendingst := io.ooo_to_mem.lsqio.pendingst loadMisalignBuffer.io.rob.pendingVst := io.ooo_to_mem.lsqio.pendingVst @@ -1087,7 +1093,7 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer) storeMisalignBuffer.io.redirect <> redirect storeMisalignBuffer.io.rob.lcommit := io.ooo_to_mem.lsqio.lcommit storeMisalignBuffer.io.rob.scommit := io.ooo_to_mem.lsqio.scommit - storeMisalignBuffer.io.rob.pendingUncacheld := io.ooo_to_mem.lsqio.pendingUncacheld + storeMisalignBuffer.io.rob.pendingMMIOld := io.ooo_to_mem.lsqio.pendingMMIOld storeMisalignBuffer.io.rob.pendingld := io.ooo_to_mem.lsqio.pendingld storeMisalignBuffer.io.rob.pendingst := io.ooo_to_mem.lsqio.pendingst storeMisalignBuffer.io.rob.pendingVst := io.ooo_to_mem.lsqio.pendingVst @@ -1276,7 +1282,7 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer) io.mem_to_ooo.lsqio.uop := lsq.io.rob.uop lsq.io.rob.lcommit := io.ooo_to_mem.lsqio.lcommit lsq.io.rob.scommit := io.ooo_to_mem.lsqio.scommit - lsq.io.rob.pendingUncacheld := io.ooo_to_mem.lsqio.pendingUncacheld + lsq.io.rob.pendingMMIOld := io.ooo_to_mem.lsqio.pendingMMIOld lsq.io.rob.pendingld := io.ooo_to_mem.lsqio.pendingld lsq.io.rob.pendingst := io.ooo_to_mem.lsqio.pendingst lsq.io.rob.pendingVst := io.ooo_to_mem.lsqio.pendingVst @@ -1298,7 +1304,7 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer) )).andR)) resultOnehot } - val allRedirect = loadUnits.map(_.io.rollback) ++ hybridUnits.map(_.io.ldu_io.rollback) ++ Seq(lsq.io.nack_rollback) ++ lsq.io.nuke_rollback + val allRedirect = loadUnits.map(_.io.rollback) ++ hybridUnits.map(_.io.ldu_io.rollback) ++ lsq.io.nack_rollback ++ lsq.io.nuke_rollback val oldestOneHot = selectOldestRedirect(allRedirect) val oldestRedirect = WireDefault(Mux1H(oldestOneHot, allRedirect)) // memory replay would not cause IAF/IPF/IGPF @@ -1329,8 +1335,7 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer) is (s_idle) { when (uncacheReq.fire) { when (lsq.io.uncache.req.valid) { - val isStore = lsq.io.uncache.req.bits.cmd === MemoryOpConstants.M_XWR - when (!isStore || !io.ooo_to_mem.csrCtrl.uncache_write_outstanding_enable) { + when (!lsq.io.uncache.req.bits.nc || !io.ooo_to_mem.csrCtrl.uncache_write_outstanding_enable) { uncacheState := s_scalar_uncache } }.otherwise { diff --git 
a/src/main/scala/xiangshan/backend/rob/Rob.scala b/src/main/scala/xiangshan/backend/rob/Rob.scala index 510285e55b..8555f27a7b 100644 --- a/src/main/scala/xiangshan/backend/rob/Rob.scala +++ b/src/main/scala/xiangshan/backend/rob/Rob.scala @@ -530,7 +530,7 @@ class RobImp(override val wrapper: Rob)(implicit p: Parameters, params: BackendP XSInfo(true.B, p"writebacked pc 0x${Hexadecimal(debug_Uop.pc)} wen ${debug_Uop.rfWen} " + p"data 0x${Hexadecimal(wb.bits.data(0))} ldst ${debug_Uop.ldest} pdst ${debug_Uop.pdest} " + - p"skip ${wb.bits.debug.isMMIO} robIdx: ${wb.bits.robIdx}\n" + p"skip ${wb.bits.debug.isSkipDiff} robIdx: ${wb.bits.robIdx}\n" ) } } @@ -799,7 +799,7 @@ class RobImp(override val wrapper: Rob)(implicit p: Parameters, params: BackendP io.lsq.lcommit := RegNext(Mux(io.commits.isCommit, PopCount(ldCommitVec), 0.U)) io.lsq.scommit := RegNext(Mux(io.commits.isCommit, PopCount(stCommitVec), 0.U)) // indicate a pending load or store - io.lsq.pendingUncacheld := RegNext(io.commits.isCommit && io.commits.info(0).commitType === CommitType.LOAD && robEntries(deqPtr.value).valid && robEntries(deqPtr.value).mmio) + io.lsq.pendingMMIOld := RegNext(io.commits.isCommit && io.commits.info(0).commitType === CommitType.LOAD && robEntries(deqPtr.value).valid && robEntries(deqPtr.value).mmio) io.lsq.pendingld := RegNext(io.commits.isCommit && io.commits.info(0).commitType === CommitType.LOAD && robEntries(deqPtr.value).valid) // TODO: Check if need deassert pendingst when it is vst io.lsq.pendingst := RegNext(io.commits.isCommit && io.commits.info(0).commitType === CommitType.STORE && robEntries(deqPtr.value).valid) @@ -1436,7 +1436,7 @@ class RobImp(override val wrapper: Rob)(implicit p: Parameters, params: BackendP val isRVC = dt_isRVC(ptr) val difftest = DifftestModule(new DiffInstrCommit(MaxPhyRegs), delay = 3, dontCare = true) - val dt_skip = Mux(eliminatedMove, false.B, exuOut.isMMIO || exuOut.isPerfCnt) + val dt_skip = Mux(eliminatedMove, false.B, exuOut.isSkipDiff) difftest.coreid := io.hartId difftest.index := i.U difftest.valid := io.commits.commitValid(i) && io.commits.isCommit diff --git a/src/main/scala/xiangshan/backend/rob/RobBundles.scala b/src/main/scala/xiangshan/backend/rob/RobBundles.scala index ac1dc82b76..5292f68510 100644 --- a/src/main/scala/xiangshan/backend/rob/RobBundles.scala +++ b/src/main/scala/xiangshan/backend/rob/RobBundles.scala @@ -231,7 +231,7 @@ class RobCSRIO(implicit p: Parameters) extends XSBundle { class RobLsqIO(implicit p: Parameters) extends XSBundle { val lcommit = Output(UInt(log2Up(CommitWidth + 1).W)) val scommit = Output(UInt(log2Up(CommitWidth + 1).W)) - val pendingUncacheld = Output(Bool()) + val pendingMMIOld = Output(Bool()) val pendingld = Output(Bool()) val pendingst = Output(Bool()) // set when vector store at the head of ROB diff --git a/src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala b/src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala index 671cac29af..4a87a11dcc 100644 --- a/src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala +++ b/src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala @@ -515,11 +515,13 @@ class UncacheWordReq(implicit p: Parameters) extends DCacheBundle { val cmd = UInt(M_SZ.W) val addr = UInt(PAddrBits.W) + val vaddr = UInt(VAddrBits.W) // for uncache buffer forwarding val data = UInt(XLEN.W) val mask = UInt((XLEN/8).W) val id = UInt(uncacheIdxBits.W) val instrtype = UInt(sourceTypeWidth.W) val atomic = Bool() + val nc = Bool() val isFirstIssue = Bool() val replayCarry = new 
ReplayCarry(nWays) @@ -533,7 +535,9 @@ class UncacheWordResp(implicit p: Parameters) extends DCacheBundle { val data = UInt(XLEN.W) val data_delayed = UInt(XLEN.W) - val id = UInt(uncacheIdxBits.W) + val id = UInt(uncacheIdxBits.W) // resp identified signals + val nc = Bool() // resp identified signals + val is2lq = Bool() // resp identified signals val miss = Bool() val replay = Bool() val tag_error = Bool() diff --git a/src/main/scala/xiangshan/cache/dcache/Uncache.scala b/src/main/scala/xiangshan/cache/dcache/Uncache.scala index 8d0e878c49..0bc016b34f 100644 --- a/src/main/scala/xiangshan/cache/dcache/Uncache.scala +++ b/src/main/scala/xiangshan/cache/dcache/Uncache.scala @@ -22,156 +22,114 @@ import org.chipsalliance.cde.config.Parameters import utils._ import utility._ import xiangshan._ +import xiangshan.mem._ import freechips.rocketchip.diplomacy.{IdRange, LazyModule, LazyModuleImp, TransferSizes} import freechips.rocketchip.tilelink.{TLArbiter, TLBundleA, TLBundleD, TLClientNode, TLEdgeOut, TLMasterParameters, TLMasterPortParameters} -class UncachePtr(implicit p: Parameters) extends CircularQueuePtr[UncachePtr]( - p => p(XSCoreParamsKey).UncacheBufferSize -){ - -} - -object UncachePtr { - def apply(f: Bool, v: UInt)(implicit p: Parameters): UncachePtr = { - val ptr = Wire(new UncachePtr) - ptr.flag := f - ptr.value := v - ptr - } -} - class UncacheFlushBundle extends Bundle { val valid = Output(Bool()) val empty = Input(Bool()) } -// One miss entry deals with one mmio request -class MMIOEntry(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule -{ - val io = IO(new Bundle { - // MSHR ID - val hartId = Input(UInt()) - // Control IO - val enableOutstanding = Input(Bool()) - - // Client requests - val req = Flipped(DecoupledIO(new UncacheWordReq)) - val resp = DecoupledIO(new DCacheWordRespWithError) - - // TileLink - val mem_acquire = DecoupledIO(new TLBundleA(edge.bundle)) - val mem_grant = Flipped(DecoupledIO(new TLBundleD(edge.bundle))) - - // This entry is valid. - val invalid = Output(Bool()) - // This entry is selected. - val select = Input(Bool()) - val atomic = Output(Bool()) - }) - // ================================================ - // FSM state description: - // s_invalid : Entry is invalid. - // s_refill_req : Send Acquire request. - // s_refill_resp : Wait for Grant response. - // s_send_resp : Send Uncache response. - val s_invalid :: s_refill_req :: s_refill_resp :: s_send_resp :: Nil = Enum(4) - val state = RegInit(s_invalid) - - val req = Reg(new UncacheWordReq) - val resp_data = Reg(UInt(DataBits.W)) - val resp_nderr = Reg(Bool()) - def storeReq = req.cmd === MemoryOpConstants.M_XWR - - io.invalid := state === s_invalid - // Assign default values to output signals. 
- io.req.ready := false.B - io.resp.valid := false.B - io.resp.bits := DontCare - - io.mem_acquire.valid := false.B - io.mem_acquire.bits := DontCare - io.mem_grant.ready := false.B - - io.atomic := req.atomic - // Receive request - when (state === s_invalid) { - io.req.ready := true.B - - when (io.req.fire) { - req := io.req.bits - req.addr := io.req.bits.addr - resp_nderr := false.B - state := s_refill_req - } +class UncacheEntry(implicit p: Parameters) extends DCacheBundle { + val cmd = UInt(M_SZ.W) + val addr = UInt(PAddrBits.W) + val vaddr = UInt(VAddrBits.W) + val data = UInt(XLEN.W) + val mask = UInt(DataBytes.W) + val id = UInt(uncacheIdxBits.W) + val nc = Bool() + val atomic = Bool() + + val resp_nderr = Bool() + + /* NOTE: if it support the internal forward logic, here can uncomment */ + // val fwd_data = UInt(XLEN.W) + // val fwd_mask = UInt(DataBytes.W) + + def set(x: UncacheWordReq): Unit = { + cmd := x.cmd + addr := x.addr + vaddr := x.vaddr + data := x.data + mask := x.mask + id := x.id + nc := x.nc + atomic := x.atomic + resp_nderr := false.B + // fwd_data := 0.U + // fwd_mask := 0.U } - // Refill - // TODO: determine 'lgSize' in memend - val size = PopCount(req.mask) - val (lgSize, legal) = PriorityMuxWithFlag(Seq( - 1.U -> 0.U, - 2.U -> 1.U, - 4.U -> 2.U, - 8.U -> 3.U - ).map(m => (size===m._1) -> m._2)) - assert(!(io.mem_acquire.valid && !legal)) - - val load = edge.Get( - fromSource = io.hartId, - toAddress = req.addr, - lgSize = lgSize - )._2 - - val store = edge.Put( - fromSource = io.hartId, - toAddress = req.addr, - lgSize = lgSize, - data = req.data, - mask = req.mask - )._2 - - XSDebug("entry: %d state: %d\n", io.hartId, state) - - when (state === s_refill_req) { - io.mem_acquire.valid := true.B && io.select - io.mem_acquire.bits := Mux(storeReq, store, load) - - when (io.mem_acquire.fire) { - state := s_refill_resp + def update(x: TLBundleD): Unit = { + when(cmd === MemoryOpConstants.M_XRD) { + data := x.data } + resp_nderr := x.denied } - val (_, _, refill_done, _) = edge.addr_inc(io.mem_grant) - when (state === s_refill_resp) { - io.mem_grant.ready := true.B - - when (io.mem_grant.fire) { - resp_data := io.mem_grant.bits.data - resp_nderr := io.mem_grant.bits.denied - // TODO: consider corrupt - assert(refill_done, "Uncache response should be one beat only!") - state := Mux(storeReq && io.enableOutstanding, s_invalid, s_send_resp) - } + // def update(forwardData: UInt, forwardMask: UInt): Unit = { + // fwd_data := forwardData + // fwd_mask := forwardMask + // } + + def toUncacheWordResp(): UncacheWordResp = { + // val resp_fwd_data = VecInit((0 until DataBytes).map(j => + // Mux(fwd_mask(j), fwd_data(8*(j+1)-1, 8*j), data(8*(j+1)-1, 8*j)) + // )).asUInt + val resp_fwd_data = data + val r = Wire(new UncacheWordResp) + r := DontCare + r.data := resp_fwd_data + r.id := id + r.nderr := resp_nderr + r.nc := nc + r.is2lq := cmd === MemoryOpConstants.M_XRD + r.miss := false.B + r.replay := false.B + r.tag_error := false.B + r.error := false.B + r } +} - // Response - when (state === s_send_resp) { - io.resp.valid := true.B - io.resp.bits.data := resp_data - // meta data should go with the response - io.resp.bits.id := req.id - io.resp.bits.miss := false.B - io.resp.bits.replay := false.B - io.resp.bits.tag_error := false.B - io.resp.bits.error := false.B - io.resp.bits.nderr := resp_nderr - - when (io.resp.fire) { - state := s_invalid - } +class UncacheEntryState(implicit p: Parameters) extends DCacheBundle { + // valid (-> waitSame) -> inflight -> waitReturn + val 
valid = Bool() + val inflight = Bool() // uncache -> L2 + val waitSame = Bool() + val waitReturn = Bool() // uncache -> LSQ + + def init: Unit = { + valid := false.B + inflight := false.B + waitSame := false.B + waitReturn := false.B } - // End + def isValid(): Bool = valid + def isInflight(): Bool = inflight + def isWaitReturn(): Bool = waitReturn + def isWaitSame(): Bool = waitSame + def can2Uncache(): Bool = valid && !inflight && !waitSame && !waitReturn + def can2Lsq(): Bool = valid && waitReturn + + def setValid(x: Bool): Unit = { valid := x} + def setInflight(x: Bool): Unit = { inflight := x} + def setWaitReturn(x: Bool): Unit = { waitReturn := x } + def setWaitSame(x: Bool): Unit = { waitSame := x} + + def updateUncacheResp(): Unit = { + assert(inflight, "The request was not sent and a response was received") + inflight := false.B + waitReturn := true.B + } + def updateReturn(): Unit = { + valid := false.B + inflight := false.B + waitSame := false.B + waitReturn := false.B + } } class UncacheIO(implicit p: Parameters) extends DCacheBundle { @@ -179,6 +137,7 @@ class UncacheIO(implicit p: Parameters) extends DCacheBundle { val enableOutstanding = Input(Bool()) val flush = Flipped(new UncacheFlushBundle) val lsq = Flipped(new UncacheWordIO) + val forward = Vec(LoadPipelineWidth, Flipped(new LoadForwardQueryIO)) } // convert DCacheIO to TileLink @@ -199,12 +158,15 @@ class Uncache()(implicit p: Parameters) extends LazyModule with HasXSParameter { lazy val module = new UncacheImp(this) } +/* Uncache Buffer */ class UncacheImp(outer: Uncache)extends LazyModuleImp(outer) with HasTLDump with HasXSParameter with HasPerfEvents { - val io = IO(new UncacheIO) + private val INDEX_WIDTH = log2Up(UncacheBufferSize) + println(s"Uncahe Buffer Size: $UncacheBufferSize entries") + val io = IO(new UncacheIO) val (bus, edge) = outer.clientNode.out.head @@ -212,9 +174,7 @@ class UncacheImp(outer: Uncache)extends LazyModuleImp(outer) val resp = io.lsq.resp val mem_acquire = bus.a val mem_grant = bus.d - val req_ready = WireInit(false.B) - val need_fence = WireInit(false.B) // assign default values to output signals bus.b.ready := false.B @@ -223,187 +183,304 @@ class UncacheImp(outer: Uncache)extends LazyModuleImp(outer) bus.d.ready := false.B bus.e.valid := false.B bus.e.bits := DontCare - - val enqPtr = RegInit(0.U.asTypeOf(new UncachePtr)) - val issPtr = RegInit(0.U.asTypeOf(new UncachePtr)) - val deqPtr = RegInit(0.U.asTypeOf(new UncachePtr)) - val fence = RegInit(Bool(), false.B) - + io.lsq.req.ready := req_ready io.lsq.resp.valid := false.B io.lsq.resp.bits := DontCare - val entries = Seq.fill(UncacheBufferSize) { Module(new MMIOEntry(edge)) } - for ((entry, i) <- entries.zipWithIndex) { - entry.io.hartId := io.hartId - entry.io.enableOutstanding := io.enableOutstanding - // Enqueue - entry.io.req.valid := (i.U === enqPtr.value) && req.valid - entry.io.req.bits := req.bits + /****************************************************************** + * Data Structure + ******************************************************************/ - when (i.U === enqPtr.value) { - req_ready := entry.io.req.ready + val entries = Reg(Vec(UncacheBufferSize, new UncacheEntry)) + val states = RegInit(VecInit(Seq.fill(UncacheBufferSize)(0.U.asTypeOf(new UncacheEntryState)))) + val fence = RegInit(Bool(), false.B) + val s_idle :: s_refill_req :: s_refill_resp :: s_send_resp :: Nil = Enum(4) + val uState = RegInit(s_idle) + + def sizeMap[T <: Data](f: Int => T) = VecInit((0 until UncacheBufferSize).map(f)) + def isStore(e: 
UncacheEntry): Bool = e.cmd === MemoryOpConstants.M_XWR + def isStore(x: UInt): Bool = x === MemoryOpConstants.M_XWR + def addrMatch(x: UncacheEntry, y: UncacheWordReq): Bool = x.addr(PAddrBits - 1, 3) === y.addr(PAddrBits - 1, 3) + def addrMatch(x: UncacheWordReq, y: UncacheEntry): Bool = x.addr(PAddrBits - 1, 3) === y.addr(PAddrBits - 1, 3) + def addrMatch(x: UncacheEntry, y: UncacheEntry): Bool = x.addr(PAddrBits - 1, 3) === y.addr(PAddrBits - 1, 3) + def addrMatch(x: UInt, y: UInt): Bool = x(PAddrBits - 1, 3) === y(PAddrBits - 1, 3) + + // drain buffer + val empty = Wire(Bool()) + val f1_needDrain = Wire(Bool()) + val do_uarch_drain = RegNext(f1_needDrain) + + val q0_entry = Wire(new UncacheEntry) + val q0_canSentIdx = Wire(UInt(INDEX_WIDTH.W)) + val q0_canSent = Wire(Bool()) + + + /****************************************************************** + * uState for non-outstanding + ******************************************************************/ + + switch(uState){ + is(s_idle){ + when(req.fire){ + uState := s_refill_req + } + } + is(s_refill_req){ + when(mem_acquire.fire){ + uState := s_refill_resp + } + } + is(s_refill_resp){ + when(mem_grant.fire){ + uState := s_send_resp + } } + is(s_send_resp){ + when(resp.fire){ + uState := s_idle + } + } + } - // Acquire - entry.io.select := (i.U === issPtr.value) && Mux(entry.io.atomic, issPtr.value === deqPtr.value, !fence) - when (i.U === issPtr.value) { - need_fence := entry.io.atomic + /****************************************************************** + * Enter Buffer + * Version 0 (better timing) + * e0 judge: alloc/merge write vec + * e1 alloc + * + * Version 1 (better performance) + * solved in one cycle for achieving the original performance. + ******************************************************************/ + + /** + TODO lyq: how to merge + 1. same addr + 2. same cmd + 3. valid + FIXME lyq: no merging for now due to the following issues + 1. loads can't be merged + 2. how to merge store and response precisely + */ + + val e0_fire = req.fire + val e0_req_valid = req.valid + val e0_req = req.bits + /** + TODO lyq: block or wait or forward? + NOW: strictly block by same address; otherwise, exhaustive consideration is needed. 
+ - ld->ld wait + - ld->st forward + - st->ld forward + - st->st block + */ + val e0_existSame = sizeMap(j => e0_req_valid && states(j).isValid() && addrMatch(e0_req, entries(j))).asUInt.orR + val e0_invalidVec = sizeMap(i => !states(i).isValid()) + val (e0_allocIdx, e0_canAlloc) = PriorityEncoderWithFlag(e0_invalidVec) + val e0_alloc = e0_canAlloc && !e0_existSame && e0_fire + req_ready := e0_invalidVec.asUInt.orR && !e0_existSame && !do_uarch_drain + + when (e0_alloc) { + entries(e0_allocIdx).set(e0_req) + states(e0_allocIdx).setValid(true.B) + + // judge whether wait same block: e0 & q0 + val waitSameVec = sizeMap(j => + e0_req_valid && states(j).isValid() && states(j).isInflight() && addrMatch(e0_req, entries(j)) + ) + val waitQ0 = q0_canSent && addrMatch(e0_req, q0_entry) + when (waitSameVec.reduce(_ || _) || waitQ0) { + states(e0_allocIdx).setWaitSame(true.B) } - // Grant - entry.io.mem_grant.valid := false.B - entry.io.mem_grant.bits := DontCare - when (i.U === deqPtr.value) { - entry.io.mem_grant <> mem_grant - } + } - entry.io.resp.ready := false.B - when (i.U === deqPtr.value) { - io.lsq.resp <> entry.io.resp - } + /****************************************************************** + * Uncache Req + * Version 0 (better timing) + * q0: choose which one is sent + * q0: sent + * + * Version 1 (better performance) + * solved in one cycle for achieving the original performance. + * NOTE: "Enter Buffer" & "Uncache Req" not a continuous pipeline, + * because there is no guarantee that mem_aquire will be always ready. + ******************************************************************/ + + val q0_canSentVec = sizeMap(i => + (io.enableOutstanding || uState === s_refill_req) && + states(i).can2Uncache() + ) + val q0_res = PriorityEncoderWithFlag(q0_canSentVec) + q0_canSentIdx := q0_res._1 + q0_canSent := q0_res._2 + q0_entry := entries(q0_canSentIdx) + + val size = PopCount(q0_entry.mask) + val (lgSize, legal) = PriorityMuxWithFlag(Seq( + 1.U -> 0.U, + 2.U -> 1.U, + 4.U -> 2.U, + 8.U -> 3.U + ).map(m => (size===m._1) -> m._2)) + assert(!(q0_canSent && !legal)) + + val q0_load = edge.Get( + fromSource = q0_canSentIdx, + toAddress = q0_entry.addr, + lgSize = lgSize + )._2 + + val q0_store = edge.Put( + fromSource = q0_canSentIdx, + toAddress = q0_entry.addr, + lgSize = lgSize, + data = q0_entry.data, + mask = q0_entry.mask + )._2 + + val q0_isStore = q0_entry.cmd === MemoryOpConstants.M_XWR + + mem_acquire.valid := q0_canSent + mem_acquire.bits := Mux(q0_isStore, q0_store, q0_load) + when(mem_acquire.fire){ + states(q0_canSentIdx).setInflight(true.B) + + // q0 should judge whether wait same block + (0 until UncacheBufferSize).map(j => + when(states(j).isValid() && !states(j).isWaitReturn() && addrMatch(q0_entry, entries(j))){ + states(j).setWaitSame(true.B) + } + ) } - io.lsq.req.ready := req_ready - when (io.enableOutstanding) { - // Uncache Buffer is a circular queue, which contains UncacheBufferSize entries. - // Description: - // enqPtr: Point to an invalid (means that the entry is free) entry. - // issPtr: Point to a ready entry, the entry is ready to issue. - // deqPtr: Point to the oldest entry, which was issued but has not accepted response (used to keep order with the program order). - // - // When outstanding disabled, only one read/write request can be accepted at a time. - // - // Example (Enable outstanding): - // 1. 
enqPtr: - // 1) Before enqueue - // enqPtr -- - // | - // | - // V - // +--+--+--+--+ - // | | | | | - // | | | | | - // | | | | | - // +--+--+--+--+ - // - // 2) After - // enqPtr+1 --- - // | - // | - // V - // +--+--+--+--+ - // | | | | | - // | | | | | - // | | | | | - // +--+--+--+--+ - // - // 2. issPtr: - // 1) Before issue - // issPtr -- - // | - // | - // V - // +--+--+--+--+ - // | | | | | - // | | | | | - // | | | | | - // +--+--+--+--+ - // - // 2) After issue - // issPtr+1 -- - // | - // | - // V - // +--+--+--+--+ - // | | | | | - // | | | | | - // | | | | | - // +--+--+--+--+ - // - // 3. deqPtr: - // 1) Before dequeue - // deqPtr -- - // | - // | - // V - // +--+--+--+--+ - // | | | | | - // | | | | | - // | | | | | - // +--+--+--+--+ - // - // 2) After dequeue - // deqPtr -- deqPtr+1 -- - // | | - // | | - // V V - // +--+--+--+--+ or +--+--+--+--+ - // | | | | | | | | | | - // | | | | | | | | | | - // | | | | | | | | | | - // +--+--+--+--+ +--+--+--+--+ - // (load) (store) - // - // 3) After response - // deqPtr+1 --- deqPtr-- - // | | - // | | - // V V - // +--+--+--+--+ or +--+--+--+--+ - // | | | | | | | | | | - // | | | | | | | | | | - // | | | | | | | | | | - // +--+--+--+--+ +--+--+--+--+ - // (load) (store) - // - - // Enqueue - when (req.fire) { - enqPtr := enqPtr + 1.U - } - // Issue - when (mem_acquire.fire) { - issPtr := issPtr + 1.U - } + /****************************************************************** + * Uncache Resp + ******************************************************************/ - when (mem_acquire.fire) { - fence := need_fence - } + val (_, _, refill_done, _) = edge.addr_inc(mem_grant) - // Dequeue - when (mem_grant.fire) { - deqPtr := Mux(edge.hasData(mem_grant.bits), deqPtr /* Load */, deqPtr + 1.U /* Store */) - } .elsewhen (io.lsq.resp.fire /* Load */) { - deqPtr := deqPtr + 1.U - } + mem_grant.ready := true.B + when (mem_grant.fire) { + val id = mem_grant.bits.source + entries(id).update(mem_grant.bits) + states(id).updateUncacheResp() + assert(refill_done, "Uncache response should be one beat only!") + + // remove state of wait same block + (0 until UncacheBufferSize).map(j => + when(states(j).isValid() && states(j).isWaitSame() && addrMatch(entries(id), entries(j))){ + states(j).setWaitSame(false.B) + } + ) + } - when (mem_grant.fire && fence) { - fence := false.B + + /****************************************************************** + * Return to LSQ + ******************************************************************/ + + val r0_canSentVec = sizeMap(i => states(i).can2Lsq()) + val (r0_canSentIdx, r0_canSent) = PriorityEncoderWithFlag(r0_canSentVec) + resp.valid := r0_canSent + resp.bits := entries(r0_canSentIdx).toUncacheWordResp() + when(resp.fire){ + states(r0_canSentIdx).updateReturn() + } + + + /****************************************************************** + * Buffer Flush + * 1. when io.flush.valid is true: drain store queue and ubuffer + * 2. when io.lsq.req.bits.atomic is true: not support temporarily + ******************************************************************/ + empty := !VecInit(states.map(_.isValid())).asUInt.orR + io.flush.empty := empty + + + /****************************************************************** + * Load Data Forward + * + * 0. ld in ldu pipeline + * f0: vaddr match, mask & data select, fast resp + * f1: paddr match, resp + * + * 1. 
ld in buffer (in "Enter Buffer") + * ld(en) -> st(in): ld entry.update, state.updateUncacheResp + * st(en) -> ld(in): ld entry.update, state.updateUncacheResp + * NOW: strict block by same address; there is no such forward. + * + ******************************************************************/ + + val f0_validMask = sizeMap(i => isStore(entries(i)) && states(i).isValid()) + val f0_fwdMaskCandidates = VecInit(entries.map(e => e.mask)) + val f0_fwdDataCandidates = VecInit(entries.map(e => e.data)) + val f1_tagMismatchVec = Wire(Vec(LoadPipelineWidth, Bool())) + f1_needDrain := f1_tagMismatchVec.asUInt.orR && !empty + + for ((forward, i) <- io.forward.zipWithIndex) { + val f0_fwdValid = forward.valid + val f1_fwdValid = RegNext(f0_fwdValid) + + // f0 vaddr match + val f0_vtagMatches = sizeMap(w => addrMatch(entries(w).vaddr, forward.vaddr)) + val f0_validTagMatches = sizeMap(w => f0_vtagMatches(w) && f0_validMask(w) && f0_fwdValid) + // f0 select + val f0_fwdMask = shiftMaskToHigh( + forward.vaddr, + Mux1H(f0_validTagMatches, f0_fwdMaskCandidates) + ).asTypeOf(Vec(VDataBytes, Bool())) + val f0_fwdData = shiftDataToHigh( + forward.vaddr, + Mux1H(f0_validTagMatches, f0_fwdDataCandidates) + ).asTypeOf(Vec(VDataBytes, UInt(8.W))) + + // f1 paddr match + val f1_fwdMask = RegEnable(f0_fwdMask, f0_fwdValid) + val f1_fwdData = RegEnable(f0_fwdData, f0_fwdValid) + // forward.paddr from dtlb, which is far from uncache + val f1_ptagMatches = sizeMap(w => addrMatch(RegEnable(entries(w).addr, f0_fwdValid), RegEnable(forward.paddr, f0_fwdValid))) + f1_tagMismatchVec(i) := sizeMap(w => + RegEnable(f0_vtagMatches(w), f0_fwdValid) =/= f1_ptagMatches(w) && RegEnable(f0_validMask(w), f0_fwdValid) && f1_fwdValid + ).asUInt.orR + when(f1_tagMismatchVec(i)) { + XSDebug("forward tag mismatch: pmatch %x vmatch %x vaddr %x paddr %x\n", + f1_ptagMatches.asUInt, + RegEnable(f0_vtagMatches.asUInt, f0_fwdValid), + RegEnable(forward.vaddr, f0_fwdValid), + RegEnable(forward.paddr, f0_fwdValid) + ) } - } .otherwise { - when (io.lsq.resp.fire) { - enqPtr := enqPtr + 1.U - issPtr := issPtr + 1.U - deqPtr := deqPtr + 1.U + // f1 output + forward.addrInvalid := false.B // addr in ubuffer is always ready + forward.dataInvalid := false.B // data in ubuffer is always ready + forward.matchInvalid := f1_tagMismatchVec(i) // paddr / vaddr cam result does not match + for (j <- 0 until VDataBytes) { + forward.forwardMaskFast(j) := f0_fwdMask(j) + + forward.forwardData(j) := f1_fwdData(j) + forward.forwardMask(j) := false.B + when(f1_fwdMask(j) && f1_fwdValid) { + forward.forwardMask(j) := true.B + } } + } - TLArbiter.lowestFromSeq(edge, mem_acquire, entries.map(_.io.mem_acquire)) - val invalid_entries = PopCount(entries.map(_.io.invalid)) - io.flush.empty := invalid_entries === UncacheBufferSize.U - println(s"Uncahe Buffer Size: $UncacheBufferSize entries") + /****************************************************************** + * Debug / Performance + ******************************************************************/ + /* Debug Counters */ // print all input/output requests for debug purpose // print req/resp XSDebug(req.fire, "req cmd: %x addr: %x data: %x mask: %x\n", req.bits.cmd, req.bits.addr, req.bits.data, req.bits.mask) XSDebug(resp.fire, "data: %x\n", req.bits.data) - // print tilelink messages when(mem_acquire.valid){ XSDebug("mem_acquire valid, ready=%d ", mem_acquire.ready) @@ -414,15 +491,23 @@ class UncacheImp(outer: Uncache)extends LazyModuleImp(outer) mem_grant.bits.dump } - // Performance Counters - def isStore: 
Bool = io.lsq.req.bits.cmd === MemoryOpConstants.M_XWR - XSPerfAccumulate("mmio_store", io.lsq.req.fire && isStore) - XSPerfAccumulate("mmio_load", io.lsq.req.fire && !isStore) - XSPerfAccumulate("mmio_outstanding", mem_acquire.fire && (deqPtr =/= issPtr)) + /* Performance Counters */ + XSPerfAccumulate("uncache_mmio_store", io.lsq.req.fire && isStore(io.lsq.req.bits.cmd) && !io.lsq.req.bits.nc) + XSPerfAccumulate("uncache_mmio_load", io.lsq.req.fire && !isStore(io.lsq.req.bits.cmd) && !io.lsq.req.bits.nc) + XSPerfAccumulate("uncache_nc_store", io.lsq.req.fire && isStore(io.lsq.req.bits.cmd) && io.lsq.req.bits.nc) + XSPerfAccumulate("uncache_nc_load", io.lsq.req.fire && !isStore(io.lsq.req.bits.cmd) && io.lsq.req.bits.nc) + XSPerfAccumulate("uncache_outstanding", uState =/= s_refill_req && mem_acquire.fire) + XSPerfAccumulate("forward_count", PopCount(io.forward.map(_.forwardMask.asUInt.orR))) + XSPerfAccumulate("forward_vaddr_match_failed", PopCount(f1_tagMismatchVec)) + val perfEvents = Seq( - ("mmio_store", io.lsq.req.fire && isStore), - ("mmio_load", io.lsq.req.fire && !isStore), - ("mmio_outstanding", mem_acquire.fire && (deqPtr =/= issPtr)) + ("uncache_mmio_store", io.lsq.req.fire && isStore(io.lsq.req.bits.cmd) && !io.lsq.req.bits.nc), + ("uncache_mmio_load", io.lsq.req.fire && !isStore(io.lsq.req.bits.cmd) && !io.lsq.req.bits.nc), + ("uncache_nc_store", io.lsq.req.fire && isStore(io.lsq.req.bits.cmd) && io.lsq.req.bits.nc), + ("uncache_nc_load", io.lsq.req.fire && !isStore(io.lsq.req.bits.cmd) && io.lsq.req.bits.nc), + ("uncache_outstanding", uState =/= s_refill_req && mem_acquire.fire), + ("forward_count", PopCount(io.forward.map(_.forwardMask.asUInt.orR))), + ("forward_vaddr_match_failed", PopCount(f1_tagMismatchVec)) ) generatePerfEvent() diff --git a/src/main/scala/xiangshan/cache/mmu/MMUBundle.scala b/src/main/scala/xiangshan/cache/mmu/MMUBundle.scala index 0fdb667cec..073e8f50e8 100644 --- a/src/main/scala/xiangshan/cache/mmu/MMUBundle.scala +++ b/src/main/scala/xiangshan/cache/mmu/MMUBundle.scala @@ -384,6 +384,9 @@ object Pbmt { def apply() = UInt(2.W) def isUncache(a: UInt) = a===nc || a===io + def isPMA(a: UInt) = a===pma + def isNC(a: UInt) = a===nc + def isIO(a: UInt) = a===io } class TlbStorageIO(nSets: Int, nWays: Int, ports: Int, nDups: Int = 1)(implicit p: Parameters) extends MMUIOBaseBundle { diff --git a/src/main/scala/xiangshan/mem/MemCommon.scala b/src/main/scala/xiangshan/mem/MemCommon.scala index 556d502290..600887672d 100644 --- a/src/main/scala/xiangshan/mem/MemCommon.scala +++ b/src/main/scala/xiangshan/mem/MemCommon.scala @@ -66,13 +66,23 @@ object genWdata { } object shiftDataToLow { - def apply(addr: UInt,data : UInt): UInt = { - Mux(addr(3), (data >> 64).asUInt,data) + def apply(addr: UInt, data : UInt): UInt = { + Mux(addr(3), (data >> 64).asUInt, data) } } object shiftMaskToLow { - def apply(addr: UInt,mask: UInt): UInt = { - Mux(addr(3),(mask >> 8).asUInt,mask) + def apply(addr: UInt, mask: UInt): UInt = { + Mux(addr(3), (mask >> 8).asUInt, mask) + } +} +object shiftDataToHigh { + def apply(addr: UInt, data : UInt): UInt = { + Mux(addr(3), (data << 64).asUInt, data) + } +} +object shiftMaskToHigh { + def apply(addr: UInt, mask: UInt): UInt = { + Mux(addr(3), (mask << 8).asUInt, mask) } } @@ -97,6 +107,7 @@ class LsPipelineBundle(implicit p: Parameters) extends XSBundle val tlbMiss = Bool() val ptwBack = Bool() val af = Bool() + val nc = Bool() val mmio = Bool() val atomic = Bool() @@ -182,6 +193,7 @@ class LdPrefetchTrainBundle(implicit p: 
Parameters) extends LsPipelineBundle { if (latch) tlbMiss := RegEnable(input.tlbMiss, enable) else tlbMiss := input.tlbMiss if (latch) ptwBack := RegEnable(input.ptwBack, enable) else ptwBack := input.ptwBack if (latch) af := RegEnable(input.af, enable) else af := input.af + if (latch) nc := RegEnable(input.nc, enable) else nc := input.nc if (latch) mmio := RegEnable(input.mmio, enable) else mmio := input.mmio if (latch) forwardMask := RegEnable(input.forwardMask, enable) else forwardMask := input.forwardMask if (latch) forwardData := RegEnable(input.forwardData, enable) else forwardData := input.forwardData @@ -369,6 +381,8 @@ class LoadNukeQueryReq(implicit p: Parameters) extends XSBundle { // provide lqI val paddr = UInt(PAddrBits.W) // dataInvalid: load data is invalid. val data_valid = Bool() + // nc: is NC access + val is_nc = Bool() } class LoadNukeQueryResp(implicit p: Parameters) extends XSBundle { diff --git a/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala b/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala index b9f59b29d5..fa44460947 100644 --- a/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala @@ -86,13 +86,14 @@ class LsqWrapper(implicit p: Parameters) extends XSModule with HasDCacheParamete } val ldout = Vec(LoadPipelineWidth, DecoupledIO(new MemExuOutput)) val ld_raw_data = Vec(LoadPipelineWidth, Output(new LoadDataFromLQBundle)) + val ncOut = Vec(LoadPipelineWidth, DecoupledIO(new LsPipelineBundle)) val replay = Vec(LoadPipelineWidth, Decoupled(new LsPipelineBundle)) val sbuffer = Vec(EnsbufferWidth, Decoupled(new DCacheWordReqWithVaddrAndPfFlag)) val sbufferVecDifftestInfo = Vec(EnsbufferWidth, Decoupled(new DynInst)) // The vector store difftest needs is val forward = Vec(LoadPipelineWidth, Flipped(new PipeLoadForwardQueryIO)) val rob = Flipped(new RobLsqIO) val nuke_rollback = Vec(StorePipelineWidth, Output(Valid(new Redirect))) - val nack_rollback = Output(Valid(new Redirect)) + val nack_rollback = Vec(1, Output(Valid(new Redirect))) // uncahce val release = Flipped(Valid(new Release)) // val refill = Flipped(Valid(new Refill)) val tl_d_channel = Input(new DcacheToLduForwardIO) @@ -198,6 +199,7 @@ class LsqWrapper(implicit p: Parameters) extends XSModule with HasDCacheParamete loadQueue.io.ldu <> io.ldu loadQueue.io.ldout <> io.ldout loadQueue.io.ld_raw_data <> io.ld_raw_data + loadQueue.io.ncOut <> io.ncOut loadQueue.io.rob <> io.rob loadQueue.io.nuke_rollback <> io.nuke_rollback loadQueue.io.nack_rollback <> io.nack_rollback @@ -245,8 +247,10 @@ class LsqWrapper(implicit p: Parameters) extends XSModule with HasDCacheParamete switch(pendingstate){ is(s_idle){ when(io.uncache.req.fire){ - pendingstate := Mux(loadQueue.io.uncache.req.valid, s_load, - Mux(io.uncacheOutstanding, s_idle, s_store)) + pendingstate := + Mux(io.uncacheOutstanding && io.uncache.req.bits.nc, s_idle, + Mux(loadQueue.io.uncache.req.valid, s_load, + s_store)) } } is(s_load){ @@ -277,19 +281,14 @@ class LsqWrapper(implicit p: Parameters) extends XSModule with HasDCacheParamete io.uncache.req.valid := false.B io.uncache.req.bits := DontCare } - when (io.uncacheOutstanding) { + when (io.uncache.resp.bits.is2lq) { io.uncache.resp <> loadQueue.io.uncache.resp } .otherwise { - when(pendingstate === s_load){ - io.uncache.resp <> loadQueue.io.uncache.resp - }.otherwise{ - io.uncache.resp <> storeQueue.io.uncache.resp - } + io.uncache.resp <> storeQueue.io.uncache.resp } loadQueue.io.debugTopDown <> io.debugTopDown - 
assert(!(loadQueue.io.uncache.req.valid && storeQueue.io.uncache.req.valid)) assert(!(loadQueue.io.uncache.resp.valid && storeQueue.io.uncache.resp.valid)) when (!io.uncacheOutstanding) { assert(!((loadQueue.io.uncache.resp.valid || storeQueue.io.uncache.resp.valid) && pendingstate === s_idle)) diff --git a/src/main/scala/xiangshan/mem/lsqueue/LoadMisalignBuffer.scala b/src/main/scala/xiangshan/mem/lsqueue/LoadMisalignBuffer.scala index ea0ecbea4f..326b1db75c 100644 --- a/src/main/scala/xiangshan/mem/lsqueue/LoadMisalignBuffer.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/LoadMisalignBuffer.scala @@ -565,6 +565,8 @@ class LoadMisalignBuffer(implicit p: Parameters) extends XSModule io.writeBack.bits.data := combinedData io.writeBack.bits.isFromLoadUnit := DontCare io.writeBack.bits.debug.isMMIO := globalMMIO + // FIXME lyq: temporarily set to false + io.writeBack.bits.debug.isNC := false.B io.writeBack.bits.debug.isPerfCnt := false.B io.writeBack.bits.debug.paddr := req.paddr io.writeBack.bits.debug.vaddr := req.vaddr diff --git a/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala b/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala index 31ecc8af5b..e42dcd5efc 100644 --- a/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/LoadQueue.scala @@ -181,12 +181,13 @@ class LoadQueue(implicit p: Parameters) extends XSModule } val ldout = Vec(LoadPipelineWidth, DecoupledIO(new MemExuOutput)) val ld_raw_data = Vec(LoadPipelineWidth, Output(new LoadDataFromLQBundle)) + val ncOut = Vec(LoadPipelineWidth, DecoupledIO(new LsPipelineBundle)) val replay = Vec(LoadPipelineWidth, Decoupled(new LsPipelineBundle)) // val refill = Flipped(ValidIO(new Refill)) val tl_d_channel = Input(new DcacheToLduForwardIO) val release = Flipped(Valid(new Release)) val nuke_rollback = Vec(StorePipelineWidth, Output(Valid(new Redirect))) - val nack_rollback = Output(Valid(new Redirect)) + val nack_rollback = Vec(1, Output(Valid(new Redirect))) // uncachebuffer val rob = Flipped(new RobLsqIO) val uncache = new UncacheWordIO val exceptionAddr = new ExceptionAddrIO @@ -210,7 +211,7 @@ class LoadQueue(implicit p: Parameters) extends XSModule val loadQueueReplay = Module(new LoadQueueReplay) // enqueue if need replay val virtualLoadQueue = Module(new VirtualLoadQueue) // control state val exceptionBuffer = Module(new LqExceptionBuffer) // exception buffer - val uncacheBuffer = Module(new UncacheBuffer) // uncache buffer + val uncacheBuffer = Module(new LoadQueueUncache) // uncache /** * LoadQueueRAR */ @@ -274,8 +275,8 @@ class LoadQueue(implicit p: Parameters) extends XSModule exceptionBuffer.io.req(LoadPipelineWidth + i).bits.uop.exceptionVec := io.vecFeedback(i).bits.exceptionVec } // mmio non-data error exception - exceptionBuffer.io.req.last := uncacheBuffer.io.exception - exceptionBuffer.io.req.last.bits.vaNeedExt := true.B + exceptionBuffer.io.req(LoadPipelineWidth + VecLoadPipelineWidth) := uncacheBuffer.io.exception + exceptionBuffer.io.req(LoadPipelineWidth + VecLoadPipelineWidth).bits.vaNeedExt := true.B exceptionBuffer.io.flushFrmMaBuf := io.flushFrmMaBuf io.exceptionAddr <> exceptionBuffer.io.exceptionAddr @@ -283,19 +284,24 @@ class LoadQueue(implicit p: Parameters) extends XSModule /** * Load uncache buffer */ - uncacheBuffer.io.redirect <> io.redirect - uncacheBuffer.io.ldout <> io.ldout - uncacheBuffer.io.ld_raw_data <> io.ld_raw_data - uncacheBuffer.io.rob <> io.rob - uncacheBuffer.io.uncache <> io.uncache + uncacheBuffer.io.redirect <> io.redirect + 
uncacheBuffer.io.mmioOut <> io.ldout + uncacheBuffer.io.ncOut <> io.ncOut + uncacheBuffer.io.mmioRawData <> io.ld_raw_data + uncacheBuffer.io.rob <> io.rob + uncacheBuffer.io.uncache <> io.uncache + for ((buff, w) <- uncacheBuffer.io.req.zipWithIndex) { - buff.valid := io.ldu.ldin(w).valid // from load_s3 - buff.bits := io.ldu.ldin(w).bits // from load_s3 + // from load_s3 + val ldinBits = io.ldu.ldin(w).bits + buff.valid := io.ldu.ldin(w).valid && (ldinBits.nc || ldinBits.mmio) && !ldinBits.rep_info.need_rep + buff.bits := ldinBits } + io.uncache.resp.ready := true.B io.nuke_rollback := loadQueueRAW.io.rollback - io.nack_rollback := uncacheBuffer.io.rollback + io.nack_rollback(0) := uncacheBuffer.io.rollback /* <------- DANGEROUS: Don't change sequence here ! -------> */ @@ -336,7 +342,7 @@ class LoadQueue(implicit p: Parameters) extends XSModule XSPerfAccumulate("full_mask_110", full_mask === 6.U) XSPerfAccumulate("full_mask_111", full_mask === 7.U) XSPerfAccumulate("nuke_rollback", io.nuke_rollback.map(_.valid).reduce(_ || _).asUInt) - XSPerfAccumulate("nack_rollabck", io.nack_rollback.valid) + XSPerfAccumulate("nack_rollabck", io.nack_rollback.map(_.valid).reduce(_ || _).asUInt) // perf cnt val perfEvents = Seq(virtualLoadQueue, loadQueueRAR, loadQueueRAW, loadQueueReplay).flatMap(_.getPerfEvents) ++ @@ -350,7 +356,7 @@ class LoadQueue(implicit p: Parameters) extends XSModule ("full_mask_110", full_mask === 6.U), ("full_mask_111", full_mask === 7.U), ("nuke_rollback", io.nuke_rollback.map(_.valid).reduce(_ || _).asUInt), - ("nack_rollback", io.nack_rollback.valid) + ("nack_rollback", io.nack_rollback.map(_.valid).reduce(_ || _).asUInt) ) generatePerfEvent() // end diff --git a/src/main/scala/xiangshan/mem/lsqueue/LoadQueueRAR.scala b/src/main/scala/xiangshan/mem/lsqueue/LoadQueueRAR.scala index c058380946..b8b07b7562 100644 --- a/src/main/scala/xiangshan/mem/lsqueue/LoadQueueRAR.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/LoadQueueRAR.scala @@ -60,7 +60,6 @@ class LoadQueueRAR(implicit p: Parameters) extends XSModule // MicroOp : Micro-op // PAddr : physical address. // Released : DCache released. - // val allocated = RegInit(VecInit(List.fill(LoadQueueRARSize)(false.B))) // The control signals need to explicitly indicate the initial value val uop = Reg(Vec(LoadQueueRARSize, new DynInst)) val paddrModule = Module(new LqPAddrModule( @@ -143,12 +142,15 @@ class LoadQueueRAR(implicit p: Parameters) extends XSModule // Fill info uop(enqIndex) := enq.bits.uop - released(enqIndex) := + // NC is uncachable and will not be explicitly released. + // So NC requests are not allowed to have RAR + released(enqIndex) := enq.bits.is_nc || ( enq.bits.data_valid && (release2Cycle.valid && enq.bits.paddr(PAddrBits-1, DCacheLineOffset) === release2Cycle.bits.paddr(PAddrBits-1, DCacheLineOffset) || release1Cycle.valid && enq.bits.paddr(PAddrBits-1, DCacheLineOffset) === release1Cycle.bits.paddr(PAddrBits-1, DCacheLineOffset)) + ) } } @@ -196,7 +198,7 @@ class LoadQueueRAR(implicit p: Parameters) extends XSModule // LoadQueueRAR Query // Load-to-Load violation check condition: // 1. Physical address match by CAM port. - // 2. release is set. + // 2. release or nc_with_data is set. // 3. Younger than current load instruction. 
val ldLdViolation = Wire(Vec(LoadPipelineWidth, Bool())) //val allocatedUInt = RegNext(allocated.asUInt) diff --git a/src/main/scala/xiangshan/mem/lsqueue/LoadQueueUncache.scala b/src/main/scala/xiangshan/mem/lsqueue/LoadQueueUncache.scala new file mode 100644 index 0000000000..0378c59d1d --- /dev/null +++ b/src/main/scala/xiangshan/mem/lsqueue/LoadQueueUncache.scala @@ -0,0 +1,597 @@ +/*************************************************************************************** + * Copyright (c) 2024 Beijing Institute of Open Source Chip (BOSC) + * Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences + * Copyright (c) 2020-2021 Peng Cheng Laboratory + * + * XiangShan is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * + * See the Mulan PSL v2 for more details. + ***************************************************************************************/ +package xiangshan.mem + +import chisel3._ +import chisel3.util._ +import org.chipsalliance.cde.config._ +import xiangshan._ +import xiangshan.backend.rob.{RobPtr, RobLsqIO} +import xiangshan.ExceptionNO._ +import xiangshan.cache._ +import utils._ +import utility._ +import xiangshan.backend.Bundles +import xiangshan.backend.Bundles.{DynInst, MemExuOutput} +import xiangshan.backend.fu.FuConfig.LduCfg +import xiangshan.backend.HasMemBlockParameters + +class UncacheEntry(entryIndex: Int)(implicit p: Parameters) extends XSModule + with HasCircularQueuePtrHelper + with HasLoadHelper +{ + val io = IO(new Bundle() { + /* control */ + val redirect = Flipped(Valid(new Redirect)) + // redirect flush + val flush = Output(Bool()) + // mmio commit + val rob = Flipped(new RobLsqIO) + // mmio select + val mmioSelect = Output(Bool()) + + /* transaction */ + // from ldu + val req = Flipped(Valid(new LqWriteBundle)) + // to ldu: mmio, data + val mmioOut = DecoupledIO(new MemExuOutput) + val mmioRawData = Output(new LoadDataFromLQBundle) + // to ldu: nc with data + val ncOut = DecoupledIO(new LsPipelineBundle) + // <=> uncache + val uncache = new UncacheWordIO + // exception generated by outer bus + val exception = Valid(new LqWriteBundle) + }) + + val req_valid = RegInit(false.B) + val isNC = RegInit(false.B) + val req = Reg(new LqWriteBundle) + + val s_idle :: s_req :: s_resp :: s_wait :: Nil = Enum(4) + val uncacheState = RegInit(s_idle) + val uncacheData = Reg(io.uncache.resp.bits.data.cloneType) + val nderr = RegInit(false.B) + + val writeback = Mux(req.nc, io.ncOut.fire, io.mmioOut.fire) + + /** + * Flush + * + * 1. direct flush during idle + * 2. 
otherwise delayed flush until receiving uncache resp + */ + val needFlushReg = RegInit(false.B) + val needFlush = req_valid && req.uop.robIdx.needFlush(io.redirect) + val flush = (needFlush && uncacheState===s_idle) || (io.uncache.resp.fire && needFlushReg) + when(flush){ + needFlushReg := false.B + }.elsewhen(needFlush){ + needFlushReg := true.B + } + + /* enter req */ + when (flush) { + req_valid := false.B + } .elsewhen (io.req.valid) { + XSError(req_valid, p"LoadQueueUncache: You cannot write a valid entry: $entryIndex") + req_valid := true.B + req := io.req.bits + nderr := false.B + } .elsewhen (writeback) { + req_valid := false.B + } + + /** + * Memory mapped IO / NC operations + * + * States: + * (1) s_idle: wait for mmio reaching ROB's head / nc req valid from loadunit + * (2) s_req: wait to be sent to uncache channel until req selected and uncache ready + * (3) s_resp: wait for response from uncache channel + * (4) s_wait: wait for loadunit to receive writeback req + */ + val pendingld = GatedValidRegNext(io.rob.pendingMMIOld) + val pendingPtr = GatedRegNext(io.rob.pendingPtr) + val canSendReq = req_valid && !needFlush && Mux( + req.nc, true.B, + pendingld && req.uop.robIdx === pendingPtr + ) + switch (uncacheState) { + is (s_idle) { + when (canSendReq) { + uncacheState := s_req + } + } + is (s_req) { + when (io.uncache.req.fire) { + uncacheState := s_resp + } + } + is (s_resp) { + when (io.uncache.resp.fire) { + when (needFlushReg) { + uncacheState := s_idle + }.otherwise{ + uncacheState := s_wait + } + } + } + is (s_wait) { + when (writeback) { + uncacheState := s_idle + } + } + } + + /* control */ + io.flush := flush + io.rob.mmio := DontCare + io.rob.uop := DontCare + io.mmioSelect := (uncacheState =/= s_idle) && req.mmio + + /* uncache req */ + io.uncache.req.valid := uncacheState === s_req + io.uncache.req.bits := DontCare + io.uncache.req.bits.cmd := MemoryOpConstants.M_XRD + io.uncache.req.bits.data := DontCare + io.uncache.req.bits.addr := req.paddr + io.uncache.req.bits.vaddr:= req.vaddr + io.uncache.req.bits.mask := Mux(req.paddr(3), req.mask(15, 8), req.mask(7, 0)) + io.uncache.req.bits.id := entryIndex.U + io.uncache.req.bits.instrtype := DontCare + io.uncache.req.bits.replayCarry := DontCare + io.uncache.req.bits.atomic := req.atomic + io.uncache.req.bits.nc := req.nc + + io.uncache.resp.ready := true.B + + /* uncache resp */ + when (io.uncache.resp.fire) { + uncacheData := io.uncache.resp.bits.data + nderr := io.uncache.resp.bits.nderr + } + + /* uncache writeback */ + val selUop = req.uop + val func = selUop.fuOpType + val raddr = req.paddr + val rdataSel = LookupTree(raddr(2, 0), List( + "b000".U -> uncacheData(63, 0), + "b001".U -> uncacheData(63, 8), + "b010".U -> uncacheData(63, 16), + "b011".U -> uncacheData(63, 24), + "b100".U -> uncacheData(63, 32), + "b101".U -> uncacheData(63, 40), + "b110".U -> uncacheData(63, 48), + "b111".U -> uncacheData(63, 56) + )) + val rdataPartialLoad = rdataHelper(selUop, rdataSel) + + io.mmioOut.valid := false.B + io.mmioOut.bits := DontCare + io.mmioRawData := DontCare + io.ncOut.valid := false.B + io.ncOut.bits := DontCare + + when(req.nc){ + io.ncOut.valid := (uncacheState === s_wait) + io.ncOut.bits := DontCare + io.ncOut.bits.uop := selUop + io.ncOut.bits.uop.lqIdx := req.uop.lqIdx + io.ncOut.bits.uop.exceptionVec(loadAccessFault) := nderr + io.ncOut.bits.data := rdataPartialLoad + io.ncOut.bits.paddr := req.paddr + io.ncOut.bits.vaddr := req.vaddr + io.ncOut.bits.nc := true.B + io.ncOut.bits.mask := Mux(req.paddr(3), 
req.mask(15, 8), req.mask(7, 0)) + io.ncOut.bits.schedIndex := req.schedIndex + io.ncOut.bits.isvec := req.isvec + io.ncOut.bits.is128bit := req.is128bit + io.ncOut.bits.vecActive := req.vecActive + }.otherwise{ + io.mmioOut.valid := (uncacheState === s_wait) + io.mmioOut.bits := DontCare + io.mmioOut.bits.uop := selUop + io.mmioOut.bits.uop.lqIdx := req.uop.lqIdx + io.mmioOut.bits.uop.exceptionVec(loadAccessFault) := nderr + io.mmioOut.bits.data := rdataPartialLoad + io.mmioOut.bits.debug.isMMIO := true.B + io.mmioOut.bits.debug.isNC := false.B + io.mmioOut.bits.debug.paddr := req.paddr + io.mmioOut.bits.debug.vaddr := req.vaddr + io.mmioRawData.lqData := uncacheData + io.mmioRawData.uop := req.uop + io.mmioRawData.addrOffset := req.paddr + } + + io.exception.valid := writeback + io.exception.bits := req + io.exception.bits.uop.exceptionVec(loadAccessFault) := nderr + + /* debug log */ + when (io.uncache.req.fire) { + XSDebug("uncache req: pc %x addr %x data %x op %x mask %x\n", + req.uop.pc, + io.uncache.req.bits.addr, + io.uncache.req.bits.data, + io.uncache.req.bits.cmd, + io.uncache.req.bits.mask + ) + } + when(io.ncOut.fire) { + XSInfo("int load miss write to cbd robidx %d lqidx %d pc 0x%x mmio %x\n", + io.ncOut.bits.uop.robIdx.asUInt, + io.ncOut.bits.uop.lqIdx.asUInt, + io.ncOut.bits.uop.pc, + true.B + ) + } + when(io.mmioOut.fire) { + XSInfo("int load miss write to cbd robidx %d lqidx %d pc 0x%x mmio %x\n", + io.mmioOut.bits.uop.robIdx.asUInt, + io.mmioOut.bits.uop.lqIdx.asUInt, + io.mmioOut.bits.uop.pc, + true.B + ) + } + +} + +class LoadQueueUncache(implicit p: Parameters) extends XSModule + with HasCircularQueuePtrHelper + with HasMemBlockParameters +{ + val io = IO(new Bundle() { + /* control */ + val redirect = Flipped(Valid(new Redirect)) + // mmio commit + val rob = Flipped(new RobLsqIO) + + /* transaction */ + // enqueue: from ldu s3 + val req = Vec(LoadPipelineWidth, Flipped(Decoupled(new LqWriteBundle))) + // writeback: mmio to ldu s0, s3 + val mmioOut = Vec(LoadPipelineWidth, DecoupledIO(new MemExuOutput)) + val mmioRawData = Vec(LoadPipelineWidth, Output(new LoadDataFromLQBundle)) + // writeback: nc to ldu s0--s3 + val ncOut = Vec(LoadPipelineWidth, Decoupled(new LsPipelineBundle)) + // <=>uncache + val uncache = new UncacheWordIO + + /* except */ + // rollback from frontend when buffer is full + val rollback = Output(Valid(new Redirect)) + // exception generated by outer bus + val exception = Valid(new LqWriteBundle) + }) + + /****************************************************************** + * Structure + ******************************************************************/ + val entries = Seq.tabulate(LoadUncacheBufferSize)(i => Module(new UncacheEntry(i))) + + val freeList = Module(new FreeList( + size = LoadUncacheBufferSize, + allocWidth = LoadPipelineWidth, + freeWidth = 4, + enablePreAlloc = true, + moduleName = "LoadQueueUncache freelist" + )) + freeList.io := DontCare + + // set default IO + entries.foreach { + case (e) => + e.io.req.valid := false.B + e.io.req.bits := DontCare + e.io.uncache.req.ready := false.B + e.io.uncache.resp.valid := false.B + e.io.uncache.resp.bits := DontCare + e.io.ncOut.ready := false.B + e.io.mmioOut.ready := false.B + } + io.uncache.req.valid := false.B + io.uncache.req.bits := DontCare + io.uncache.resp.ready := false.B + for (w <- 0 until LoadPipelineWidth) { + io.mmioOut(w).valid := false.B + io.mmioOut(w).bits := DontCare + io.mmioRawData(w) := DontCare + io.ncOut(w).valid := false.B + io.ncOut(w).bits := DontCare + } + + + 
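+  // Note on UncacheEntry's writeback data selection above: the rdataSel LookupTree
+  // picks the loaded bytes by the low address bits, i.e. it shifts the 64-bit uncache
+  // data right by 8 * paddr(2, 0) bits (e.g. paddr(2, 0) = "b011" selects uncacheData(63, 24)).
+  // A minimal equivalent helper would be (hypothetical sketch, not used by this patch):
+  //   def selectByByteOffset(data: UInt, byteOff: UInt): UInt = (data >> (byteOff << 3))(63, 0)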
/****************************************************************** + * Enqueue + * + * s1: hold + * s2: confirm enqueue and write entry + * valid: no redirect, no exception, no replay, is mmio/nc + * ready: freelist can allocate + ******************************************************************/ + + val s1_req = VecInit(io.req.map(_.bits)) + val s1_valid = VecInit(io.req.map(_.valid)) + val s2_enqueue = Wire(Vec(LoadPipelineWidth, Bool())) + io.req.zipWithIndex.foreach{ case (r, i) => + r.ready := !s2_enqueue(i) || freeList.io.canAllocate(i) + } + + // s2: enqueue + val s2_req = (0 until LoadPipelineWidth).map(i => {RegEnable(s1_req(i), s1_valid(i))}) + val s2_valid = (0 until LoadPipelineWidth).map(i => { + RegNext(s1_valid(i)) && + !s2_req(i).uop.robIdx.needFlush(RegNext(io.redirect)) && + !s2_req(i).uop.robIdx.needFlush(io.redirect) + }) + val s2_has_exception = s2_req.map(x => ExceptionNO.selectByFu(x.uop.exceptionVec, LduCfg).asUInt.orR) + val s2_need_replay = s2_req.map(_.rep_info.need_rep) + + for (w <- 0 until LoadPipelineWidth) { + s2_enqueue(w) := s2_valid(w) && !s2_has_exception(w) && !s2_need_replay(w) && (s2_req(w).mmio || s2_req(w).nc) + } + + val s2_enqValidVec = Wire(Vec(LoadPipelineWidth, Bool())) + val s2_enqIndexVec = Wire(Vec(LoadPipelineWidth, UInt())) + + for (w <- 0 until LoadPipelineWidth) { + freeList.io.allocateReq(w) := true.B + } + + // freeList real-allocate + for (w <- 0 until LoadPipelineWidth) { + freeList.io.doAllocate(w) := s2_enqValidVec(w) + } + + for (w <- 0 until LoadPipelineWidth) { + s2_enqValidVec(w) := s2_enqueue(w) && freeList.io.canAllocate(w) + + val offset = PopCount(s2_enqueue.take(w)) + s2_enqIndexVec(w) := freeList.io.allocateSlot(offset) + } + + + /****************************************************************** + * Uncache Transaction + * + * 1. uncache req + * 2. uncache resp + * 3. 
writeback + ******************************************************************/ + private val NC_WB_MOD = NCWBPorts.length + + val uncacheReq = Wire(DecoupledIO(io.uncache.req.bits.cloneType)) + val mmioSelect = entries.map(e => e.io.mmioSelect).reduce(_ || _) + val mmioReq = Wire(DecoupledIO(io.uncache.req.bits.cloneType)) + // TODO lyq: It's best to choose in robIdx order / the order in which they enter + val ncReqArb = Module(new RRArbiterInit(io.uncache.req.bits.cloneType, LoadUncacheBufferSize)) + + val mmioOut = Wire(DecoupledIO(io.mmioOut(0).bits.cloneType)) + val mmioRawData = Wire(io.mmioRawData(0).cloneType) + val ncOut = Wire(chiselTypeOf(io.ncOut)) + val ncOutValidVec = VecInit(entries.map(e => e.io.ncOut.valid)) + val ncOutValidVecRem = SubVec.getMaskRem(ncOutValidVec, NC_WB_MOD) + + // init + uncacheReq.valid := false.B + uncacheReq.bits := DontCare + mmioReq.valid := false.B + mmioReq.bits := DontCare + mmioOut.valid := false.B + mmioOut.bits := DontCare + mmioRawData := DontCare + for (i <- 0 until LoadUncacheBufferSize) { + ncReqArb.io.in(i).valid := false.B + ncReqArb.io.in(i).bits := DontCare + } + for (i <- 0 until LoadPipelineWidth) { + ncOut(i).valid := false.B + ncOut(i).bits := DontCare + } + + entries.zipWithIndex.foreach { + case (e, i) => + // enqueue + for (w <- 0 until LoadPipelineWidth) { + when (s2_enqValidVec(w) && (i.U === s2_enqIndexVec(w))) { + e.io.req.valid := true.B + e.io.req.bits := s2_req(w) + } + } + + // control + e.io.redirect <> io.redirect + e.io.rob <> io.rob + + // uncache req, writeback + when (e.io.mmioSelect) { + mmioReq.valid := e.io.uncache.req.valid + mmioReq.bits := e.io.uncache.req.bits + e.io.uncache.req.ready := mmioReq.ready + + e.io.mmioOut.ready := mmioOut.ready + mmioOut.valid := e.io.mmioOut.valid + mmioOut.bits := e.io.mmioOut.bits + mmioRawData := e.io.mmioRawData + + }.otherwise{ + ncReqArb.io.in(i).valid := e.io.uncache.req.valid + ncReqArb.io.in(i).bits := e.io.uncache.req.bits + e.io.uncache.req.ready := ncReqArb.io.in(i).ready + + (0 until NC_WB_MOD).map { w => + val (idx, ncOutValid) = PriorityEncoderWithFlag(ncOutValidVecRem(w)) + val port = NCWBPorts(w) + when((i.U === idx) && ncOutValid) { + ncOut(port).valid := ncOutValid + ncOut(port).bits := e.io.ncOut.bits + e.io.ncOut.ready := ncOut(port).ready + } + } + + } + + // uncache resp + when (i.U === io.uncache.resp.bits.id) { + e.io.uncache.resp <> io.uncache.resp + } + + } + + mmioReq.ready := false.B + ncReqArb.io.out.ready := false.B + when(mmioSelect){ + uncacheReq <> mmioReq + }.otherwise{ + uncacheReq <> ncReqArb.io.out + } + + // uncache Request + AddPipelineReg(uncacheReq, io.uncache.req, false.B) + + // uncache Writeback + AddPipelineReg(mmioOut, io.mmioOut(UncacheWBPort), false.B) + io.mmioRawData(UncacheWBPort) := RegEnable(mmioRawData, mmioOut.fire) + + (0 until LoadPipelineWidth).foreach { i => AddPipelineReg(ncOut(i), io.ncOut(i), false.B) } + + // uncache exception + io.exception.valid := Cat(entries.map(_.io.exception.valid)).orR + io.exception.bits := ParallelPriorityMux(entries.map(e => + (e.io.exception.valid, e.io.exception.bits) + )) + + // rob + for (i <- 0 until LoadPipelineWidth) { + io.rob.mmio(i) := RegNext(s1_valid(i) && s1_req(i).mmio) + io.rob.uop(i) := RegEnable(s1_req(i).uop, s1_valid(i)) + } + + + /****************************************************************** + * Deallocate + ******************************************************************/ + // UncacheBuffer deallocate + val freeMaskVec = Wire(Vec(LoadUncacheBufferSize, Bool())) 
+ + // init + freeMaskVec.map(e => e := false.B) + + // dealloc logic + entries.zipWithIndex.foreach { + case (e, i) => + when ((e.io.mmioSelect && e.io.mmioOut.fire) || e.io.ncOut.fire || e.io.flush) { + freeMaskVec(i) := true.B + } + } + + freeList.io.free := freeMaskVec.asUInt + + + /****************************************************************** + * Uncache rollback detection + * + * When uncache loads try to enqueue but no entry can be allocated, they can not enqueue and need re-execution, so a rollback redirect is generated. + * + * Cycle 0: uncache enqueue. + * Cycle 1: Select oldest uncache loads. + * Cycle 2: Redirect Fire. + * Choose the oldest load from LoadPipelineWidth oldest loads. + * Prepare redirect request according to the detected rejection. + * Fire redirect request (if valid) + * + * Load_S3 .... Load_S3 + * stage 0: lq lq + * | | (can not enqueue) + * stage 1: lq lq + * | | + * --------------- + * | + * stage 2: lq + * | + * rollback req + * + ******************************************************************/ + def selectOldestRedirect(xs: Seq[Valid[Redirect]]): Vec[Bool] = { + val compareVec = (0 until xs.length).map(i => (0 until i).map(j => isAfter(xs(j).bits.robIdx, xs(i).bits.robIdx))) + val resultOnehot = VecInit((0 until xs.length).map(i => Cat((0 until xs.length).map(j => + (if (j < i) !xs(j).valid || compareVec(i)(j) + else if (j == i) xs(i).valid + else !xs(j).valid || !compareVec(j)(i)) + )).andR)) + resultOnehot + } + val reqNeedCheck = VecInit((0 until LoadPipelineWidth).map(w => + s2_enqueue(w) && !s2_enqValidVec(w) + )) + val reqSelUops = VecInit(s2_req.map(_.uop)) + val allRedirect = (0 until LoadPipelineWidth).map(i => { + val redirect = Wire(Valid(new Redirect)) + redirect.valid := reqNeedCheck(i) + redirect.bits := DontCare + redirect.bits.isRVC := reqSelUops(i).preDecodeInfo.isRVC + redirect.bits.robIdx := reqSelUops(i).robIdx + redirect.bits.ftqIdx := reqSelUops(i).ftqPtr + redirect.bits.ftqOffset := reqSelUops(i).ftqOffset + redirect.bits.level := RedirectLevel.flush + redirect.bits.cfiUpdate.target := reqSelUops(i).pc // TODO: check if need pc + redirect.bits.debug_runahead_checkpoint_id := reqSelUops(i).debugInfo.runahead_checkpoint_id + redirect + }) + val oldestOneHot = selectOldestRedirect(allRedirect) + val oldestRedirect = Mux1H(oldestOneHot, allRedirect) + val lastCycleRedirect = Wire(Valid(new Redirect)) + lastCycleRedirect.valid := RegNext(io.redirect.valid) + lastCycleRedirect.bits := RegEnable(io.redirect.bits, io.redirect.valid) + val lastLastCycleRedirect = Wire(Valid(new Redirect)) + lastLastCycleRedirect.valid := RegNext(lastCycleRedirect.valid) + lastLastCycleRedirect.bits := RegEnable(lastCycleRedirect.bits, lastCycleRedirect.valid) + io.rollback.valid := GatedValidRegNext(oldestRedirect.valid && + !oldestRedirect.bits.robIdx.needFlush(io.redirect) && + !oldestRedirect.bits.robIdx.needFlush(lastCycleRedirect) && + !oldestRedirect.bits.robIdx.needFlush(lastLastCycleRedirect)) + io.rollback.bits := RegEnable(oldestRedirect.bits, oldestRedirect.valid) + + + /****************************************************************** + * Perf Counter + ******************************************************************/ + val validCount = freeList.io.validCount + val allowEnqueue = !freeList.io.empty + QueuePerf(LoadUncacheBufferSize, validCount, !allowEnqueue) + + XSPerfAccumulate("mmio_uncache_req", io.uncache.req.fire && !io.uncache.req.bits.nc) + XSPerfAccumulate("mmio_writeback_success", io.mmioOut(0).fire) + XSPerfAccumulate("mmio_writeback_blocked", io.mmioOut(0).valid && 
!io.mmioOut(0).ready) + XSPerfAccumulate("nc_uncache_req", io.uncache.req.fire && io.uncache.req.bits.nc) + XSPerfAccumulate("nc_writeback_success", io.ncOut(0).fire) + XSPerfAccumulate("nc_writeback_blocked", io.ncOut(0).valid && !io.ncOut(0).ready) + XSPerfAccumulate("uncache_full_rollback", io.rollback.valid) + + val perfEvents: Seq[(String, UInt)] = Seq( + ("mmio_uncache_req", io.uncache.req.fire && !io.uncache.req.bits.nc), + ("mmio_writeback_success", io.mmioOut(0).fire), + ("mmio_writeback_blocked", io.mmioOut(0).valid && !io.mmioOut(0).ready), + ("nc_uncache_req", io.uncache.req.fire && io.uncache.req.bits.nc), + ("nc_writeback_success", io.ncOut(0).fire), + ("nc_writeback_blocked", io.ncOut(0).valid && !io.ncOut(0).ready), + ("uncache_full_rollback", io.rollback.valid) + ) + // end +} diff --git a/src/main/scala/xiangshan/mem/lsqueue/StoreMisalignBuffer.scala b/src/main/scala/xiangshan/mem/lsqueue/StoreMisalignBuffer.scala index f4585d78c6..fd0be8652a 100644 --- a/src/main/scala/xiangshan/mem/lsqueue/StoreMisalignBuffer.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/StoreMisalignBuffer.scala @@ -578,6 +578,8 @@ class StoreMisalignBuffer(implicit p: Parameters) extends XSModule io.writeBack.bits.data := unalignedStoreData io.writeBack.bits.isFromLoadUnit := DontCare io.writeBack.bits.debug.isMMIO := globalMMIO + // FIXME lyq: temporarily set to false + io.writeBack.bits.debug.isNC := false.B io.writeBack.bits.debug.isPerfCnt := false.B io.writeBack.bits.debug.paddr := req.paddr io.writeBack.bits.debug.vaddr := req.vaddr diff --git a/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala b/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala index f5437650de..9e9a76d8b2 100644 --- a/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala @@ -255,12 +255,13 @@ class StoreQueue(implicit p: Parameters) extends XSModule // state & misc val allocated = RegInit(VecInit(List.fill(StoreQueueSize)(false.B))) // sq entry has been allocated - val addrvalid = RegInit(VecInit(List.fill(StoreQueueSize)(false.B))) // non-mmio addr is valid - val datavalid = RegInit(VecInit(List.fill(StoreQueueSize)(false.B))) // non-mmio data is valid - val allvalid = VecInit((0 until StoreQueueSize).map(i => addrvalid(i) && datavalid(i))) // non-mmio data & addr is valid + val addrvalid = RegInit(VecInit(List.fill(StoreQueueSize)(false.B))) + val datavalid = RegInit(VecInit(List.fill(StoreQueueSize)(false.B))) + val allvalid = VecInit((0 until StoreQueueSize).map(i => addrvalid(i) && datavalid(i))) val committed = RegInit(VecInit(List.fill(StoreQueueSize)(false.B))) // inst has been committed by rob val unaligned = RegInit(VecInit(List.fill(StoreQueueSize)(false.B))) // unaligned store val pending = RegInit(VecInit(List.fill(StoreQueueSize)(false.B))) // mmio pending: inst is an mmio inst, it will not be executed until it reachs the end of rob + val nc = RegInit(VecInit(List.fill(StoreQueueSize)(false.B))) // nc: inst is a nc inst val mmio = RegInit(VecInit(List.fill(StoreQueueSize)(false.B))) // mmio: inst is an mmio inst val atomic = RegInit(VecInit(List.fill(StoreQueueSize)(false.B))) val prefetch = RegInit(VecInit(List.fill(StoreQueueSize)(false.B))) // need prefetch when committing this store to sbuffer? 
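The per-entry flags declared above (committed, pending, nc, mmio) determine how a store leaves the queue; the later hunks implement this with separate mmio/nc state machines and a dataBuffer path. A software-only sketch of the intended policy, not part of the patch and deliberately simplified (vector, misaligned and CMO stores as well as the addr/data-valid checks are ignored):

    object StoreDrainSketch {
      final case class SqEntry(allocated: Boolean, committed: Boolean, nc: Boolean,
                               mmio: Boolean, pendingAtRobHead: Boolean)

      sealed trait Drain
      case object ToSbuffer     extends Drain // cacheable store: drained to the sbuffer after commit
      case object ToUncacheNC   extends Drain // nc store: sent to the uncache channel after commit
      case object ToUncacheMMIO extends Drain // mmio store: sent only once the ROB reports it pending at its head
      case object NotReady      extends Drain

      def drain(e: SqEntry): Drain =
        if (!e.allocated) NotReady
        else if (e.mmio)  { if (e.pendingAtRobHead) ToUncacheMMIO else NotReady }
        else if (e.nc)    { if (e.committed) ToUncacheNC else NotReady }
        else              { if (e.committed) ToSbuffer else NotReady }
    }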
@@ -300,6 +301,17 @@ class StoreQueue(implicit p: Parameters) extends XSModule val finishMisalignSt = GatedValidRegNext(doMisalignSt && io.maControl.control.removeSq && !io.maControl.control.hasException) val misalignBlock = doMisalignSt && !finishMisalignSt + val mmioReq = Wire(chiselTypeOf(io.uncache.req)) + val ncReq = Wire(chiselTypeOf(io.uncache.req)) + val ncResp = Wire(chiselTypeOf(io.uncache.resp)) + val ncDoReq = Wire(Bool()) + val ncDoResp = Wire(Bool()) + val ncReadNextTrigger = Mux(io.uncacheOutstanding, ncDoReq, ncDoResp) + // ncDoReq is double RegNexted, as ubuffer data write takes 3 cycles. + // TODO lyq: to eliminate coupling by passing signals through ubuffer + val ncDeqTrigger = Mux(io.uncacheOutstanding, RegNext(RegNext(ncDoReq)), ncDoResp) + val ncPtr = Mux(io.uncacheOutstanding, RegNext(RegNext(io.uncache.req.bits.id)), io.uncache.resp.bits.id) + // store miss align info io.maControl.storeInfo.data := dataModule.io.rdata(0).data io.maControl.storeInfo.dataReady := doMisalignSt @@ -313,13 +325,10 @@ class StoreQueue(implicit p: Parameters) extends XSModule assert(EnsbufferWidth <= 2) // rdataPtrExtNext and rdataPtrExtNext+1 entry will be read from dataModule val rdataPtrExtNext = Wire(Vec(EnsbufferWidth, new SqPtr)) - rdataPtrExtNext := WireInit(Mux(dataBuffer.io.enq(1).fire, - VecInit(rdataPtrExt.map(_ + 2.U)), - Mux(dataBuffer.io.enq(0).fire || io.mmioStout.fire || io.vecmmioStout.fire, - VecInit(rdataPtrExt.map(_ + 1.U)), - rdataPtrExt - ) - )) + rdataPtrExtNext := rdataPtrExt.map(i => i + + PopCount(dataBuffer.io.enq.map(_.fire)) + + PopCount(ncReadNextTrigger || io.mmioStout.fire || io.vecmmioStout.fire) + ) // deqPtrExtNext traces which inst is about to leave store queue // @@ -330,17 +339,16 @@ class StoreQueue(implicit p: Parameters) extends XSModule // // Modify deqPtrExtNext and io.sqDeq with care! 
val deqPtrExtNext = Wire(Vec(EnsbufferWidth, new SqPtr)) - deqPtrExtNext := Mux(RegNext(io.sbuffer(1).fire), - VecInit(deqPtrExt.map(_ + 2.U)), - Mux((RegNext(io.sbuffer(0).fire)) || io.mmioStout.fire || io.vecmmioStout.fire, - VecInit(deqPtrExt.map(_ + 1.U)), - deqPtrExt - ) + deqPtrExtNext := deqPtrExt.map(i => i + + RegNext(PopCount(VecInit(io.sbuffer.map(_.fire)))) + + PopCount(ncDeqTrigger || io.mmioStout.fire || io.vecmmioStout.fire) + ) + + io.sqDeq := RegNext( + RegNext(PopCount(VecInit(io.sbuffer.map(_.fire && !misalignBlock)))) + + PopCount(ncDeqTrigger || io.mmioStout.fire || io.vecmmioStout.fire || finishMisalignSt) ) - io.sqDeq := RegNext(Mux(RegNext(io.sbuffer(1).fire && !misalignBlock), 2.U, - Mux((RegNext(io.sbuffer(0).fire && !misalignBlock)) || io.mmioStout.fire || io.vecmmioStout.fire || finishMisalignSt, 1.U, 0.U) - )) assert(!RegNext(RegNext(io.sbuffer(0).fire) && (io.mmioStout.fire || io.vecmmioStout.fire))) for (i <- 0 until EnsbufferWidth) { @@ -383,6 +391,7 @@ class StoreQueue(implicit p: Parameters) extends XSModule committed((index + j.U).value) := false.B pending((index + j.U).value) := false.B prefetch((index + j.U).value) := false.B + nc((index + j.U).value) := false.B mmio((index + j.U).value) := false.B isVec((index + j.U).value) := FuType.isVStore(io.enq.req(i).bits.fuType) vecMbCommit((index + j.U).value) := false.B @@ -483,6 +492,7 @@ class StoreQueue(implicit p: Parameters) extends XSModule when (io.storeAddrIn(i).fire) { val addr_valid = !io.storeAddrIn(i).bits.miss addrvalid(stWbIndex) := addr_valid //!io.storeAddrIn(i).bits.mmio + nc(stWbIndex) := io.storeAddrIn(i).bits.nc // pending(stWbIndex) := io.storeAddrIn(i).bits.mmio unaligned(stWbIndex) := io.storeAddrIn(i).bits.uop.exceptionVec(storeAddrMisaligned) && !io.storeAddrIn(i).bits.isvec @@ -785,31 +795,28 @@ class StoreQueue(implicit p: Parameters) extends XSModule //(2) when they reach ROB's head, they can be sent to uncache channel // TODO: CAN NOT deal with vector mmio now! 
val s_idle :: s_req :: s_resp :: s_wb :: s_wait :: Nil = Enum(5) - val uncacheState = RegInit(s_idle) + val mmioState = RegInit(s_idle) val uncacheUop = Reg(new DynInst) val uncacheVAddr = Reg(UInt(VAddrBits.W)) val cboFlushedSb = RegInit(false.B) val cmoOpCode = uncacheUop.fuOpType(1, 0) - switch(uncacheState) { + val mmioDoReq = io.uncache.req.fire && !io.uncache.req.bits.nc + switch(mmioState) { is(s_idle) { when(RegNext(io.rob.pendingst && uop(deqPtr).robIdx === io.rob.pendingPtr && pending(deqPtr) && allocated(deqPtr) && datavalid(deqPtr) && addrvalid(deqPtr))) { - uncacheState := s_req + mmioState := s_req uncacheUop := uop(deqPtr) cboFlushedSb := false.B } } is(s_req) { - when (io.uncache.req.fire) { - when (io.uncacheOutstanding) { - uncacheState := s_wb - } .otherwise { - uncacheState := s_resp - } + when (mmioDoReq) { + mmioState := s_resp } } is(s_resp) { - when(io.uncache.resp.fire) { - uncacheState := s_wb + when(io.uncache.resp.fire && !io.uncache.resp.bits.nc) { + mmioState := s_wb when (io.uncache.resp.bits.nderr) { uncacheUop.exceptionVec(storeAccessFault) := true.B @@ -819,26 +826,88 @@ class StoreQueue(implicit p: Parameters) extends XSModule is(s_wb) { when (io.mmioStout.fire || io.vecmmioStout.fire) { when (uncacheUop.exceptionVec(storeAccessFault)) { - uncacheState := s_idle + mmioState := s_idle }.otherwise { - uncacheState := s_wait + mmioState := s_wait } } } is(s_wait) { // A MMIO store can always move cmtPtrExt as it must be ROB head when(scommit > 0.U) { - uncacheState := s_idle // ready for next mmio + mmioState := s_idle // ready for next mmio + } + } + } + + mmioReq.valid := mmioState === s_req + mmioReq.bits := DontCare + mmioReq.bits.cmd := MemoryOpConstants.M_XWR + mmioReq.bits.addr := paddrModule.io.rdata(0) // data(deqPtr) -> rdata(0) + mmioReq.bits.vaddr:= vaddrModule.io.rdata(0) + mmioReq.bits.data := shiftDataToLow(paddrModule.io.rdata(0), dataModule.io.rdata(0).data) + mmioReq.bits.mask := shiftMaskToLow(paddrModule.io.rdata(0), dataModule.io.rdata(0).mask) + mmioReq.bits.atomic := atomic(GatedRegNext(rdataPtrExtNext(0)).value) + mmioReq.bits.nc := false.B + mmioReq.bits.id := rdataPtrExt(0).value + + /** + * NC Store + * (1) req: when it has been commited, it can be sent to lower level. + * (2) resp: because SQ data forward is required, it can only be deq when ncResp is received + */ + // TODO: CAN NOT deal with vector nc now! 
+ val nc_idle :: nc_req :: nc_resp :: Nil = Enum(3) + val ncState = RegInit(nc_idle) + val rptr0 = rdataPtrExt(0).value + switch(ncState){ + is(nc_idle) { + when(nc(rptr0) && allocated(rptr0) && committed(rptr0) && !mmio(rptr0) && !isVec(rptr0)) { + ncState := nc_req + } + } + is(nc_req) { + when(ncDoReq) { + when(io.uncacheOutstanding) { + ncState := nc_idle + }.otherwise{ + ncState := nc_resp + } + } + } + is(nc_resp) { + when(ncResp.fire) { + ncState := nc_idle } } } - io.uncache.req.valid := uncacheState === s_req - io.uncache.req.bits := DontCare - io.uncache.req.bits.cmd := MemoryOpConstants.M_XWR - io.uncache.req.bits.addr := paddrModule.io.rdata(0) // data(deqPtr) -> rdata(0) - io.uncache.req.bits.data := shiftDataToLow(paddrModule.io.rdata(0), dataModule.io.rdata(0).data) - io.uncache.req.bits.mask := shiftMaskToLow(paddrModule.io.rdata(0), dataModule.io.rdata(0).mask) + ncDoReq := io.uncache.req.fire && io.uncache.req.bits.nc + ncDoResp := ncResp.fire + + ncReq.valid := ncState === nc_req + ncReq.bits := DontCare + ncReq.bits.cmd := MemoryOpConstants.M_XWR + ncReq.bits.addr := paddrModule.io.rdata(0) + ncReq.bits.vaddr:= vaddrModule.io.rdata(0) + ncReq.bits.data := shiftDataToLow(paddrModule.io.rdata(0), dataModule.io.rdata(0).data) + ncReq.bits.mask := shiftMaskToLow(paddrModule.io.rdata(0), dataModule.io.rdata(0).mask) + ncReq.bits.atomic := atomic(GatedRegNext(rdataPtrExtNext(0)).value) + ncReq.bits.nc := true.B + ncReq.bits.id := rptr0 + + ncResp.ready := io.uncache.resp.ready + ncResp.valid := io.uncache.resp.fire && io.uncache.resp.bits.nc + ncResp.bits <> io.uncache.resp.bits + when (ncDeqTrigger) { + allocated(ncPtr) := false.B + XSDebug("nc fire: ptr %d\n", ncPtr) + } + + mmioReq.ready := io.uncache.req.ready + ncReq.ready := io.uncache.req.ready && !mmioReq.valid + io.uncache.req.valid := mmioReq.valid || ncReq.valid + io.uncache.req.bits := Mux(mmioReq.valid, mmioReq.bits, ncReq.bits) // CBO op type check can be delayed for 1 cycle, // as uncache op will not start in s_idle @@ -849,36 +918,34 @@ class StoreQueue(implicit p: Parameters) extends XSModule io.uncache.req.valid := false.B when (io.cmoOpReq.fire) { - uncacheState := s_resp + mmioState := s_resp } - when (uncacheState === s_resp) { + when (mmioState === s_resp) { when (io.cmoOpResp.fire) { - uncacheState := s_wb + mmioState := s_wb } } } - io.cmoOpReq.valid := deqCanDoCbo && cboFlushedSb && (uncacheState === s_req) + io.cmoOpReq.valid := deqCanDoCbo && cboFlushedSb && (mmioState === s_req) io.cmoOpReq.bits.opcode := cmoOpCode io.cmoOpReq.bits.address := cboMmioAddr - io.cmoOpResp.ready := deqCanDoCbo && (uncacheState === s_resp) + io.cmoOpResp.ready := deqCanDoCbo && (mmioState === s_resp) - io.flushSbuffer.valid := deqCanDoCbo && !cboFlushedSb && (uncacheState === s_req) && !io.flushSbuffer.empty + io.flushSbuffer.valid := deqCanDoCbo && !cboFlushedSb && (mmioState === s_req) && !io.flushSbuffer.empty - when(deqCanDoCbo && !cboFlushedSb && (uncacheState === s_req) && io.flushSbuffer.empty) { + when(deqCanDoCbo && !cboFlushedSb && (mmioState === s_req) && io.flushSbuffer.empty) { cboFlushedSb := true.B } - io.uncache.req.bits.atomic := atomic(GatedRegNext(rdataPtrExtNext(0)).value) - - when(io.uncache.req.fire){ + when(mmioDoReq){ // mmio store should not be committed until uncache req is sent pending(deqPtr) := false.B XSDebug( - p"uncache req: pc ${Hexadecimal(uop(deqPtr).pc)} " + + p"uncache mmio req: pc ${Hexadecimal(uop(deqPtr).pc)} " + p"addr ${Hexadecimal(io.uncache.req.bits.addr)} " + p"data 
${Hexadecimal(io.uncache.req.bits.data)} " + p"op ${Hexadecimal(io.uncache.req.bits.cmd)} " + @@ -890,7 +957,7 @@ class StoreQueue(implicit p: Parameters) extends XSModule io.uncache.resp.ready := true.B // (4) scalar store: writeback to ROB (and other units): mark as writebacked - io.mmioStout.valid := uncacheState === s_wb && !isVec(deqPtr) + io.mmioStout.valid := mmioState === s_wb && !isVec(deqPtr) io.mmioStout.bits.uop := uncacheUop io.mmioStout.bits.uop.exceptionVec := ExceptionNO.selectByFu(uncacheUop.exceptionVec, StaCfg) io.mmioStout.bits.uop.sqIdx := deqPtrExt(0) @@ -898,6 +965,7 @@ class StoreQueue(implicit p: Parameters) extends XSModule io.mmioStout.bits.data := shiftDataToLow(paddrModule.io.rdata(0), dataModule.io.rdata(0).data) // dataModule.io.rdata.read(deqPtr) io.mmioStout.bits.isFromLoadUnit := DontCare io.mmioStout.bits.debug.isMMIO := true.B + io.mmioStout.bits.debug.isNC := false.B io.mmioStout.bits.debug.paddr := DontCare io.mmioStout.bits.debug.isPerfCnt := false.B io.mmioStout.bits.debug.vaddr := DontCare @@ -916,11 +984,12 @@ class StoreQueue(implicit p: Parameters) extends XSModule // (4) or vector store: // TODO: implement it! io.vecmmioStout := DontCare - io.vecmmioStout.valid := false.B //uncacheState === s_wb && isVec(deqPtr) + io.vecmmioStout.valid := false.B //mmioState === s_wb && isVec(deqPtr) io.vecmmioStout.bits.uop := uop(deqPtr) io.vecmmioStout.bits.uop.sqIdx := deqPtrExt(0) io.vecmmioStout.bits.data := shiftDataToLow(paddrModule.io.rdata(0), dataModule.io.rdata(0).data) // dataModule.io.rdata.read(deqPtr) io.vecmmioStout.bits.debug.isMMIO := true.B + io.vecmmioStout.bits.debug.isNC := false.B io.vecmmioStout.bits.debug.paddr := DontCare io.vecmmioStout.bits.debug.isPerfCnt := false.B io.vecmmioStout.bits.debug.vaddr := DontCare @@ -936,7 +1005,7 @@ class StoreQueue(implicit p: Parameters) extends XSModule * (1) When store commits, mark it as committed. * (2) They will not be cancelled and can be sent to lower level. 
*/ - XSError(uncacheState =/= s_idle && uncacheState =/= s_wait && commitCount > 0.U, + XSError(mmioState =/= s_idle && mmioState =/= s_wait && commitCount > 0.U, "should not commit instruction when MMIO has not been finished\n") val commitVec = WireInit(VecInit(Seq.fill(CommitWidth)(false.B))) @@ -947,10 +1016,15 @@ class StoreQueue(implicit p: Parameters) extends XSModule // TODO: Deal with vector store mmio for (i <- 0 until CommitWidth) { // don't mark misalign store as committed - when (allocated(cmtPtrExt(i).value) && !unaligned(cmtPtrExt(i).value) && isNotAfter(uop(cmtPtrExt(i).value).robIdx, GatedRegNext(io.rob.pendingPtr)) && !needCancel(cmtPtrExt(i).value) && (!waitStoreS2(cmtPtrExt(i).value) || isVec(cmtPtrExt(i).value))) { + when ( + allocated(cmtPtrExt(i).value) && + !unaligned(cmtPtrExt(i).value) && + isNotAfter(uop(cmtPtrExt(i).value).robIdx, GatedRegNext(io.rob.pendingPtr)) && + !needCancel(cmtPtrExt(i).value) && + (!waitStoreS2(cmtPtrExt(i).value) || isVec(cmtPtrExt(i).value))) { if (i == 0){ // TODO: fixme for vector mmio - when ((uncacheState === s_idle) || (uncacheState === s_wait && scommit > 0.U)){ + when ((mmioState === s_idle) || (mmioState === s_wait && scommit > 0.U)){ when ((isVec(cmtPtrExt(i).value) && vecMbCommit(cmtPtrExt(i).value)) || !isVec(cmtPtrExt(i).value)) { committed(cmtPtrExt(0).value) := true.B commitVec(0) := true.B @@ -968,15 +1042,21 @@ class StoreQueue(implicit p: Parameters) extends XSModule commitCount := PopCount(commitVec) cmtPtrExt := cmtPtrExt.map(_ + commitCount) - // committed stores will not be cancelled and can be sent to lower level. - // remove retired insts from sq, add retired store to sbuffer - - // Read data from data module - // As store queue grows larger and larger, time needed to read data from data - // module keeps growing higher. Now we give data read a whole cycle. + /** + * committed stores will not be cancelled and can be sent to lower level. + * + * 1. Store NC: Read data to uncache + * implement as above + * + * 2. Store Cache: Read data from data module + * remove retired insts from sq, add retired store to sbuffer. + * as store queue grows larger and larger, time needed to read data from data + * module keeps growing higher. Now we give data read a whole cycle. 
+ */ for (i <- 0 until EnsbufferWidth) { val ptr = rdataPtrExt(i).value val mmioStall = if(i == 0) mmio(rdataPtrExt(0).value) else (mmio(rdataPtrExt(i).value) || mmio(rdataPtrExt(i-1).value)) + val ncStall = if(i == 0) nc(rdataPtrExt(0).value) else (nc(rdataPtrExt(i).value) || nc(rdataPtrExt(i-1).value)) val exceptionValid = if(i == 0) hasException(rdataPtrExt(0).value) else { hasException(rdataPtrExt(i).value) || (hasException(rdataPtrExt(i-1).value) && uop(rdataPtrExt(i).value).robIdx === uop(rdataPtrExt(i-1).value).robIdx) } @@ -988,13 +1068,13 @@ class StoreQueue(implicit p: Parameters) extends XSModule dataBuffer.io.enq(i).valid := Mux( doMisalignSt, io.maControl.control.writeSb, - allocated(ptr) && committed(ptr) && ((!isVec(ptr) && (allvalid(ptr) || hasException(ptr))) || vecMbCommit(ptr)) && !mmioStall + allocated(ptr) && committed(ptr) && ((!isVec(ptr) && (allvalid(ptr) || hasException(ptr))) || vecMbCommit(ptr)) && !mmioStall && !ncStall ) } else { dataBuffer.io.enq(i).valid := Mux( doMisalignSt, false.B, - allocated(ptr) && committed(ptr) && ((!isVec(ptr) && (allvalid(ptr) || hasException(ptr))) || vecMbCommit(ptr)) && !mmioStall + allocated(ptr) && committed(ptr) && ((!isVec(ptr) && (allvalid(ptr) || hasException(ptr))) || vecMbCommit(ptr)) && !mmioStall && !ncStall ) } // Note that store data/addr should both be valid after store's commit @@ -1039,8 +1119,9 @@ class StoreQueue(implicit p: Parameters) extends XSModule // Flags are used to record whether there are any exceptions when the queue is displayed. // This is determined each time a write is made to the 'databuffer', prevent subsequent uop of the same instruction from writing to the 'dataBuffer'. val vecCommitHasException = (0 until EnsbufferWidth).map{ i => - val ptr = rdataPtrExt(i).value - val mmioStall = if(i == 0) mmio(rdataPtrExt(0).value) else (mmio(rdataPtrExt(i).value) || mmio(rdataPtrExt(i-1).value)) + val ptr = rdataPtrExt(i).value + val mmioStall = if(i == 0) mmio(rdataPtrExt(0).value) else (mmio(rdataPtrExt(i).value) || mmio(rdataPtrExt(i-1).value)) + val ncStall = if(i == 0) nc(rdataPtrExt(0).value) else (nc(rdataPtrExt(i).value) || nc(rdataPtrExt(i-1).value)) val exceptionVliad = isVec(ptr) && hasException(ptr) && dataBuffer.io.enq(i).fire (exceptionVliad, uop(ptr), vecLastFlow(ptr)) } @@ -1070,9 +1151,8 @@ class StoreQueue(implicit p: Parameters) extends XSModule val vecExceptionFlagCancel = (0 until EnsbufferWidth).map{ i => - val ptr = rdataPtrExt(i).value - val mmioStall = if(i == 0) mmio(rdataPtrExt(0).value) else (mmio(rdataPtrExt(i).value) || mmio(rdataPtrExt(i-1).value)) - val vecLastFlowCommit = vecLastFlow(ptr) && (uop(ptr).robIdx === vecExceptionFlag.bits.robIdx) && dataBuffer.io.enq(i).fire + val ptr = rdataPtrExt(i).value + val vecLastFlowCommit = vecLastFlow(ptr) && (uop(ptr).robIdx === vecExceptionFlag.bits.robIdx) && dataBuffer.io.enq(i).fire vecLastFlowCommit }.reduce(_ || _) @@ -1099,7 +1179,6 @@ class StoreQueue(implicit p: Parameters) extends XSModule if (env.EnableDifftest) { for (i <- 0 until EnsbufferWidth) { val ptr = rdataPtrExt(i).value - val mmioStall = if(i == 0) mmio(rdataPtrExt(0).value) else (mmio(rdataPtrExt(i).value) || mmio(rdataPtrExt(i-1).value)) difftestBuffer.get.io.enq(i).valid := dataBuffer.io.enq(i).valid difftestBuffer.get.io.enq(i).bits := uop(ptr) } @@ -1240,8 +1319,8 @@ class StoreQueue(implicit p: Parameters) extends XSModule val vecValidVec = WireInit(VecInit((0 until StoreQueueSize).map(i => allocated(i) && isVec(i)))) QueuePerf(StoreQueueSize, 
PopCount(vecValidVec), !allowEnqueue) io.sqFull := !allowEnqueue - XSPerfAccumulate("mmioCycle", uncacheState =/= s_idle) // lq is busy dealing with uncache req - XSPerfAccumulate("mmioCnt", io.uncache.req.fire) + XSPerfAccumulate("mmioCycle", mmioState =/= s_idle) // lq is busy dealing with uncache req + XSPerfAccumulate("mmioCnt", mmioDoReq) XSPerfAccumulate("mmio_wb_success", io.mmioStout.fire || io.vecmmioStout.fire) XSPerfAccumulate("mmio_wb_blocked", (io.mmioStout.valid && !io.mmioStout.ready) || (io.vecmmioStout.valid && !io.vecmmioStout.ready)) XSPerfAccumulate("validEntryCnt", distanceBetween(enqPtrExt(0), deqPtrExt(0))) @@ -1250,8 +1329,8 @@ class StoreQueue(implicit p: Parameters) extends XSModule val perfValidCount = distanceBetween(enqPtrExt(0), deqPtrExt(0)) val perfEvents = Seq( - ("mmioCycle ", uncacheState =/= s_idle), - ("mmioCnt ", io.uncache.req.fire), + ("mmioCycle ", mmioState =/= s_idle), + ("mmioCnt ", mmioDoReq), ("mmio_wb_success", io.mmioStout.fire || io.vecmmioStout.fire), ("mmio_wb_blocked", (io.mmioStout.valid && !io.mmioStout.ready) || (io.vecmmioStout.valid && !io.vecmmioStout.ready)), ("stq_1_4_valid ", (perfValidCount < (StoreQueueSize.U/4.U))), diff --git a/src/main/scala/xiangshan/mem/lsqueue/UncacheBuffer.scala b/src/main/scala/xiangshan/mem/lsqueue/UncacheBuffer.scala deleted file mode 100644 index ca583a41de..0000000000 --- a/src/main/scala/xiangshan/mem/lsqueue/UncacheBuffer.scala +++ /dev/null @@ -1,477 +0,0 @@ -/*************************************************************************************** -* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences -* Copyright (c) 2020-2021 Peng Cheng Laboratory -* -* XiangShan is licensed under Mulan PSL v2. -* You can use this software according to the terms and conditions of the Mulan PSL v2. -* You may obtain a copy of Mulan PSL v2 at: -* http://license.coscl.org.cn/MulanPSL2 -* -* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, -* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, -* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. -* -* See the Mulan PSL v2 for more details. 
-***************************************************************************************/ -package xiangshan.mem - -import chisel3._ -import chisel3.util._ -import org.chipsalliance.cde.config._ -import xiangshan._ -import xiangshan.backend.rob.{RobPtr, RobLsqIO} -import xiangshan.ExceptionNO._ -import xiangshan.cache._ -import utils._ -import utility._ -import xiangshan.backend.Bundles -import xiangshan.backend.Bundles.{DynInst, MemExuOutput} -import xiangshan.backend.fu.FuConfig.LduCfg -import xiangshan.backend.HasMemBlockParameters - -class UncacheBufferEntry(entryIndex: Int)(implicit p: Parameters) extends XSModule - with HasCircularQueuePtrHelper - with HasLoadHelper -{ - val io = IO(new Bundle() { - val id = Input(UInt()) - - val redirect = Flipped(Valid(new Redirect)) - - // client requests - val req = Flipped(Valid(new LqWriteBundle)) - - // writeback mmio data - val ldout = DecoupledIO(new MemExuOutput) - val ld_raw_data = Output(new LoadDataFromLQBundle) - - // rob: uncache commit - val rob = Flipped(new RobLsqIO) - - // uncache io - val uncache = new UncacheWordIO - - // select this entry - val select = Output(Bool()) - - // flush this entry - val flush = Output(Bool()) - - // exception generated by outer bus - val exception = Valid(new LqWriteBundle) - }) - - val req_valid = RegInit(false.B) - val req = Reg(new LqWriteBundle) - - // - val s_idle :: s_req :: s_resp :: s_wait :: Nil = Enum(4) - val uncacheState = RegInit(s_idle) - val uncacheData = Reg(io.uncache.resp.bits.data.cloneType) - val nderr = RegInit(false.B) - - // enqueue - when (req_valid && req.uop.robIdx.needFlush(io.redirect)) { - req_valid := false.B - } .elsewhen (io.req.valid) { - XSError(req_valid, p"UncacheBuffer: You can not write an valid entry: $entryIndex") - req_valid := true.B - req := io.req.bits - nderr := false.B - } .elsewhen (io.ldout.fire) { - req_valid := false.B - } - - io.flush := req_valid && req.uop.robIdx.needFlush(io.redirect) - /** - * Memory mapped IO / other uncached operations - * - * States: - * (1) writeback from store units: mark as pending - * (2) when they reach ROB's head, they can be sent to uncache channel - * (3) response from uncache channel: mark as datavalid - * (4) writeback to ROB (and other units): mark as writebacked - * (5) ROB commits the instruction: same as normal instructions - */ - - io.rob.mmio := DontCare - io.rob.uop := DontCare - val pendingld = GatedValidRegNext(io.rob.pendingUncacheld) - val pendingPtr = GatedRegNext(io.rob.pendingPtr) - - switch (uncacheState) { - is (s_idle) { - when (req_valid && pendingld && req.uop.robIdx === pendingPtr) { - uncacheState := s_req - } - } - is (s_req) { - when (io.uncache.req.fire) { - uncacheState := s_resp - } - } - is (s_resp) { - when (io.uncache.resp.fire) { - uncacheState := s_wait - } - } - is (s_wait) { - when (io.ldout.fire) { - uncacheState := s_idle // ready for next mmio - } - } - } - - io.select := uncacheState =/= s_idle - - io.uncache.req.valid := uncacheState === s_req - io.uncache.req.bits := DontCare - io.uncache.req.bits.cmd := MemoryOpConstants.M_XRD - io.uncache.req.bits.data := DontCare - io.uncache.req.bits.addr := req.paddr - io.uncache.req.bits.mask := Mux(req.paddr(3), req.mask(15, 8), req.mask(7, 0)) - io.uncache.req.bits.id := io.id - io.uncache.req.bits.instrtype := DontCare - io.uncache.req.bits.replayCarry := DontCare - io.uncache.req.bits.atomic := true.B - - io.uncache.resp.ready := true.B - - when (io.uncache.req.fire) { - XSDebug("uncache req: pc %x addr %x data %x op %x mask %x\n", - 
req.uop.pc, - io.uncache.req.bits.addr, - io.uncache.req.bits.data, - io.uncache.req.bits.cmd, - io.uncache.req.bits.mask - ) - } - - // (3) response from uncache channel - when (io.uncache.resp.fire) { - uncacheData := io.uncache.resp.bits.data - nderr := io.uncache.resp.bits.nderr - } - - // uncache writeback - val selUop = req.uop - val func = selUop.fuOpType - val raddr = req.paddr - val rdataSel = LookupTree(raddr(2, 0), List( - "b000".U -> uncacheData(63, 0), - "b001".U -> uncacheData(63, 8), - "b010".U -> uncacheData(63, 16), - "b011".U -> uncacheData(63, 24), - "b100".U -> uncacheData(63, 32), - "b101".U -> uncacheData(63, 40), - "b110".U -> uncacheData(63, 48), - "b111".U -> uncacheData(63, 56) - )) - val rdataPartialLoad = rdataHelper(selUop, rdataSel) - - io.ldout.valid := (uncacheState === s_wait) - io.ldout.bits := DontCare - io.ldout.bits.uop := selUop - io.ldout.bits.uop.lqIdx := req.uop.lqIdx - io.ldout.bits.uop.exceptionVec(loadAccessFault) := nderr - io.ldout.bits.data := rdataPartialLoad - io.ldout.bits.debug.isMMIO := true.B - io.ldout.bits.debug.paddr := req.paddr - io.ldout.bits.debug.vaddr := req.vaddr - - io.ld_raw_data.lqData := uncacheData - io.ld_raw_data.uop := req.uop - io.ld_raw_data.addrOffset := req.paddr - - io.exception.valid := io.ldout.fire - io.exception.bits := req - io.exception.bits.uop.exceptionVec(loadAccessFault) := nderr - - - when (io.ldout.fire) { - req_valid := false.B - - XSInfo("int load miss write to cbd robidx %d lqidx %d pc 0x%x mmio %x\n", - io.ldout.bits.uop.robIdx.asUInt, - io.ldout.bits.uop.lqIdx.asUInt, - io.ldout.bits.uop.pc, - true.B - ) - } - - // end -} - -class UncacheBuffer(implicit p: Parameters) extends XSModule - with HasCircularQueuePtrHelper - with HasMemBlockParameters { - val io = IO(new Bundle() { - // control - val redirect = Flipped(Valid(new Redirect)) - - // - val req = Vec(LoadPipelineWidth, Flipped(Valid(new LqWriteBundle))) - - // writeback mmio data - val ldout = Vec(LoadPipelineWidth, DecoupledIO(new MemExuOutput)) - val ld_raw_data = Vec(LoadPipelineWidth, Output(new LoadDataFromLQBundle)) - - // rob: uncache commit - val rob = Flipped(new RobLsqIO) - - // uncache io - val uncache = new UncacheWordIO - - // rollback from frontend when uncache buffer is full - val rollback = Output(Valid(new Redirect)) - - // exception generated by outer bus - val exception = Valid(new LqWriteBundle) - }) - - val entries = Seq.tabulate(LoadUncacheBufferSize)(i => Module(new UncacheBufferEntry(i))) - - // freelist: store valid entries index. - // +---+---+--------------+-----+-----+ - // | 0 | 1 | ...... 
| n-2 | n-1 | - // +---+---+--------------+-----+-----+ - val freeList = Module(new FreeList( - size = LoadUncacheBufferSize, - allocWidth = LoadPipelineWidth, - freeWidth = 4, - enablePreAlloc = true, - moduleName = "UncacheBuffer freelist" - )) - freeList.io := DontCare - - // set enqueue default - entries.foreach { - case (e) => - e.io.req.valid := false.B - e.io.req.bits := DontCare - } - - // set uncache default - io.uncache.req.valid := false.B - io.uncache.req.bits := DontCare - io.uncache.resp.ready := false.B - - entries.foreach { - case (e) => - e.io.uncache.req.ready := false.B - e.io.uncache.resp.valid := false.B - e.io.uncache.resp.bits := DontCare - } - - // set writeback default - for (w <- 0 until LoadPipelineWidth) { - io.ldout(w).valid := false.B - io.ldout(w).bits := DontCare - io.ld_raw_data(w) := DontCare - } - - // enqueue - // s1: - val s1_req = VecInit(io.req.map(_.bits)) - val s1_valid = VecInit(io.req.map(_.valid)) - - // s2: enqueue - val s2_req = (0 until LoadPipelineWidth).map(i => { - RegEnable(s1_req(i), s1_valid(i))}) - val s2_valid = (0 until LoadPipelineWidth).map(i => { - RegNext(s1_valid(i)) && - !s2_req(i).uop.robIdx.needFlush(RegNext(io.redirect)) && - !s2_req(i).uop.robIdx.needFlush(io.redirect) - }) - val s2_has_exception = s2_req.map(x => ExceptionNO.selectByFu(x.uop.exceptionVec, LduCfg).asUInt.orR) - val s2_need_replay = s2_req.map(_.rep_info.need_rep) - - val s2_enqueue = Wire(Vec(LoadPipelineWidth, Bool())) - for (w <- 0 until LoadPipelineWidth) { - s2_enqueue(w) := s2_valid(w) && !s2_has_exception(w) && !s2_need_replay(w) && s2_req(w).mmio - } - - // - val enqValidVec = Wire(Vec(LoadPipelineWidth, Bool())) - val enqIndexVec = Wire(Vec(LoadPipelineWidth, UInt())) - - for (w <- 0 until LoadPipelineWidth) { - freeList.io.allocateReq(w) := true.B - } - - // freeList real-allocate - for (w <- 0 until LoadPipelineWidth) { - freeList.io.doAllocate(w) := enqValidVec(w) - } - - for (w <- 0 until LoadPipelineWidth) { - enqValidVec(w) := s2_enqueue(w) && freeList.io.canAllocate(w) - - val offset = PopCount(s2_enqueue.take(w)) - enqIndexVec(w) := freeList.io.allocateSlot(offset) - } - - // - val uncacheReq = Wire(DecoupledIO(io.uncache.req.bits.cloneType)) - val ldout = Wire(DecoupledIO(io.ldout(0).bits.cloneType)) - val ld_raw_data = Wire(io.ld_raw_data(0).cloneType) - - // init - uncacheReq.valid := false.B - uncacheReq.bits := DontCare - ldout.valid := false.B - ldout.bits := DontCare - ld_raw_data := DontCare - - entries.zipWithIndex.foreach { - case (e, i) => - e.io.redirect <> io.redirect - e.io.id := i.U - - // enqueue - for (w <- 0 until LoadPipelineWidth) { - when (enqValidVec(w) && (i.U === enqIndexVec(w))) { - e.io.req.valid := true.B - e.io.req.bits := s2_req(w) - } - } - - // uncache logic - e.io.rob <> io.rob - e.io.uncache.req.ready := uncacheReq.ready - e.io.ldout.ready := ldout.ready - - when (e.io.select) { - uncacheReq.valid := e.io.uncache.req.valid - uncacheReq.bits := e.io.uncache.req.bits - - ldout.valid := e.io.ldout.valid - ldout.bits := e.io.ldout.bits - ld_raw_data := e.io.ld_raw_data - // Read vaddr for mem exception - // no inst will be commited 1 cycle before tval update - // read vaddr for mmio, and only port 0 is used - } - - when (i.U === io.uncache.resp.bits.id) { - e.io.uncache.resp <> io.uncache.resp - } - } - - // uncache Request - AddPipelineReg(uncacheReq, io.uncache.req, false.B) - - // uncache Writeback - AddPipelineReg(ldout, io.ldout(UncacheWBPort), false.B) - - // uncache exception - io.exception.valid := 
Cat(entries.map(_.io.exception.valid)).orR - io.exception.bits := ParallelPriorityMux(entries.map(e => - (e.io.exception.valid, e.io.exception.bits) - )) - - io.ld_raw_data(UncacheWBPort) := RegEnable(ld_raw_data, ldout.fire) - - for (i <- 0 until LoadPipelineWidth) { - io.rob.mmio(i) := RegNext(s1_valid(i) && s1_req(i).mmio) - io.rob.uop(i) := RegEnable(s1_req(i).uop, s1_valid(i)) - } - - // UncacheBuffer deallocate - val freeMaskVec = Wire(Vec(LoadUncacheBufferSize, Bool())) - - // init - freeMaskVec.map(e => e := false.B) - - // dealloc logic - entries.zipWithIndex.foreach { - case (e, i) => - when ((e.io.select && e.io.ldout.fire) || e.io.flush) { - freeMaskVec(i) := true.B - } - } - - freeList.io.free := freeMaskVec.asUInt - - /** - * Uncache rollback detection - * - * When uncache loads enqueue, it searches uncache loads, They can not enqueue and need re-execution. - * - * Cycle 0: uncache enqueue. - * Cycle 1: Select oldest uncache loads. - * Cycle 2: Redirect Fire. - * Choose the oldest load from LoadPipelineWidth oldest loads. - * Prepare redirect request according to the detected rejection. - * Fire redirect request (if valid) - */ - // Load_S3 .... Load_S3 - // stage 0: lq lq - // | | (can not enqueue) - // stage 1: lq lq - // | | - // --------------- - // | - // stage 2: lq - // | - // rollback req - def selectOldestRedirect(xs: Seq[Valid[Redirect]]): Vec[Bool] = { - val compareVec = (0 until xs.length).map(i => (0 until i).map(j => isAfter(xs(j).bits.robIdx, xs(i).bits.robIdx))) - val resultOnehot = VecInit((0 until xs.length).map(i => Cat((0 until xs.length).map(j => - (if (j < i) !xs(j).valid || compareVec(i)(j) - else if (j == i) xs(i).valid - else !xs(j).valid || !compareVec(j)(i)) - )).andR)) - resultOnehot - } - val reqNeedCheck = VecInit((0 until LoadPipelineWidth).map(w => - s2_enqueue(w) && !enqValidVec(w) - )) - val reqSelUops = VecInit(s2_req.map(_.uop)) - val allRedirect = (0 until LoadPipelineWidth).map(i => { - val redirect = Wire(Valid(new Redirect)) - redirect.valid := reqNeedCheck(i) - redirect.bits := DontCare - redirect.bits.isRVC := reqSelUops(i).preDecodeInfo.isRVC - redirect.bits.robIdx := reqSelUops(i).robIdx - redirect.bits.ftqIdx := reqSelUops(i).ftqPtr - redirect.bits.ftqOffset := reqSelUops(i).ftqOffset - redirect.bits.level := RedirectLevel.flush - redirect.bits.cfiUpdate.target := reqSelUops(i).pc // TODO: check if need pc - redirect.bits.debug_runahead_checkpoint_id := reqSelUops(i).debugInfo.runahead_checkpoint_id - redirect - }) - val oldestOneHot = selectOldestRedirect(allRedirect) - val oldestRedirect = Mux1H(oldestOneHot, allRedirect) - val lastCycleRedirect = Wire(Valid(new Redirect)) - lastCycleRedirect.valid := RegNext(io.redirect.valid) - lastCycleRedirect.bits := RegEnable(io.redirect.bits, io.redirect.valid) - val lastLastCycleRedirect = Wire(Valid(new Redirect)) - lastLastCycleRedirect.valid := RegNext(lastCycleRedirect.valid) - lastLastCycleRedirect.bits := RegEnable(lastCycleRedirect.bits, lastCycleRedirect.valid) - io.rollback.valid := GatedValidRegNext(oldestRedirect.valid && - !oldestRedirect.bits.robIdx.needFlush(io.redirect) && - !oldestRedirect.bits.robIdx.needFlush(lastCycleRedirect) && - !oldestRedirect.bits.robIdx.needFlush(lastLastCycleRedirect)) - io.rollback.bits := RegEnable(oldestRedirect.bits, oldestRedirect.valid) - - // perf counter - val validCount = freeList.io.validCount - val allowEnqueue = !freeList.io.empty - QueuePerf(LoadUncacheBufferSize, validCount, !allowEnqueue) - - XSPerfAccumulate("mmioCycle", 
VecInit(entries.map(_.io.select)).asUInt.orR) - XSPerfAccumulate("mmioCnt", io.uncache.req.fire) - XSPerfAccumulate("mmio_writeback_success", io.ldout(0).fire) - XSPerfAccumulate("mmio_writeback_blocked", io.ldout(0).valid && !io.ldout(0).ready) - XSPerfAccumulate("uncache_full_rollback", io.rollback.valid) - - val perfEvents: Seq[(String, UInt)] = Seq( - ("mmioCycle", VecInit(entries.map(_.io.select)).asUInt.orR), - ("mmioCnt", io.uncache.req.fire), - ("mmio_writeback_success", io.ldout(0).fire), - ("mmio_writeback_blocked", io.ldout(0).valid && !io.ldout(0).ready), - ("uncache_full_rollback", io.rollback.valid) - ) - // end -} diff --git a/src/main/scala/xiangshan/mem/pipeline/AtomicsUnit.scala b/src/main/scala/xiangshan/mem/pipeline/AtomicsUnit.scala index d5f6c346c9..8ef5afcffb 100644 --- a/src/main/scala/xiangshan/mem/pipeline/AtomicsUnit.scala +++ b/src/main/scala/xiangshan/mem/pipeline/AtomicsUnit.scala @@ -31,6 +31,7 @@ import xiangshan.backend.fu.FuType import xiangshan.backend.Bundles.{MemExuInput, MemExuOutput} import xiangshan.backend.fu.NewCSR.TriggerUtil import xiangshan.backend.fu.util.SdtrigExt +import xiangshan.cache.mmu.Pbmt class AtomicsUnit(implicit p: Parameters) extends XSModule with MemoryOpConstants @@ -72,6 +73,7 @@ class AtomicsUnit(implicit p: Parameters) extends XSModule val gpaddr = Reg(UInt()) val vaddr = in.src(0) val is_mmio = Reg(Bool()) + val is_nc = RegInit(false.B) val isForVSnonLeafPTE = Reg(Bool()) // dcache response data @@ -236,6 +238,7 @@ class AtomicsUnit(implicit p: Parameters) extends XSModule trigger := triggerAction when (!io.dtlb.resp.bits.miss) { + is_nc := Pbmt.isNC(io.dtlb.resp.bits.pbmt(0)) io.out.bits.uop.debugInfo.tlbRespTime := GTimer() when (!addrAligned || triggerDebugMode || triggerBreakpoint) { // NOTE: when addrAligned or trigger fire, do not need to wait tlb actually @@ -251,9 +254,10 @@ class AtomicsUnit(implicit p: Parameters) extends XSModule } } + val pbmtReg = RegEnable(io.dtlb.resp.bits.pbmt(0), io.dtlb.resp.fire && !io.dtlb.resp.bits.miss) when (state === s_pm) { val pmp = WireInit(io.pmpResp) - is_mmio := pmp.mmio + is_mmio := Pbmt.isIO(pbmtReg) || (Pbmt.isPMA(pbmtReg) && pmp.mmio) // NOTE: only handle load/store exception here, if other exception happens, don't send here val exception_va = exceptionVec(storePageFault) || exceptionVec(loadPageFault) || @@ -398,6 +402,7 @@ class AtomicsUnit(implicit p: Parameters) extends XSModule io.out.bits.uop.fuType := FuType.mou.U io.out.bits.data := resp_data io.out.bits.debug.isMMIO := is_mmio + io.out.bits.debug.isNC := is_nc io.out.bits.debug.paddr := paddr when (io.out.fire) { XSDebug("atomics writeback: pc %x data %x\n", io.out.bits.uop.pc, io.dcache.resp.bits.data) diff --git a/src/main/scala/xiangshan/mem/pipeline/HybridUnit.scala b/src/main/scala/xiangshan/mem/pipeline/HybridUnit.scala index e191f4addb..5df3af828a 100644 --- a/src/main/scala/xiangshan/mem/pipeline/HybridUnit.scala +++ b/src/main/scala/xiangshan/mem/pipeline/HybridUnit.scala @@ -62,6 +62,7 @@ class HybridUnit(implicit p: Parameters) extends XSModule // data path val sbuffer = new LoadForwardQueryIO + val ubuffer = new LoadForwardQueryIO val vec_forward = new LoadForwardQueryIO val lsq = new LoadToLsqIO val tl_d_channel = Input(new DcacheToLduForwardIO) @@ -608,6 +609,14 @@ class HybridUnit(implicit p: Parameters) extends XSModule io.ldu_io.sbuffer.mask := s1_in.mask io.ldu_io.sbuffer.pc := s1_in.uop.pc // FIXME: remove it + io.ldu_io.ubuffer.valid := s1_valid && !(s1_exception || s1_tlb_miss || s1_kill || 
s1_fast_rep_kill || s1_prf || !s1_ld_flow) + io.ldu_io.ubuffer.vaddr := s1_vaddr + io.ldu_io.ubuffer.paddr := s1_paddr_dup_lsu + io.ldu_io.ubuffer.uop := s1_in.uop + io.ldu_io.ubuffer.sqIdx := s1_in.uop.sqIdx + io.ldu_io.ubuffer.mask := s1_in.mask + io.ldu_io.ubuffer.pc := s1_in.uop.pc // FIXME: remove it + io.ldu_io.vec_forward.valid := s1_valid && !(s1_exception || s1_tlb_miss || s1_kill || s1_fast_rep_kill || s1_prf || !s1_ld_flow) io.ldu_io.vec_forward.vaddr := s1_vaddr io.ldu_io.vec_forward.paddr := s1_paddr_dup_lsu @@ -970,16 +979,12 @@ class HybridUnit(implicit p: Parameters) extends XSModule s2_full_fwd := ((~s2_fwd_mask.asUInt).asUInt & s2_in.mask) === 0.U && !io.ldu_io.lsq.forward.dataInvalid && !io.ldu_io.vec_forward.dataInvalid // generate XLEN/8 Muxs for (i <- 0 until VLEN / 8) { - s2_fwd_mask(i) := io.ldu_io.lsq.forward.forwardMask(i) || io.ldu_io.sbuffer.forwardMask(i) || io.ldu_io.vec_forward.forwardMask(i) - s2_fwd_data(i) := Mux( - io.ldu_io.lsq.forward.forwardMask(i), - io.ldu_io.lsq.forward.forwardData(i), - Mux( - io.ldu_io.vec_forward.forwardMask(i), - io.ldu_io.vec_forward.forwardData(i), - io.ldu_io.sbuffer.forwardData(i) - ) - ) + s2_fwd_mask(i) := io.ldu_io.lsq.forward.forwardMask(i) || io.ldu_io.sbuffer.forwardMask(i) || io.ldu_io.vec_forward.forwardMask(i) || io.ldu_io.ubuffer.forwardMask(i) + s2_fwd_data(i) := + Mux(io.ldu_io.lsq.forward.forwardMask(i), io.ldu_io.lsq.forward.forwardData(i), + Mux(io.ldu_io.vec_forward.forwardMask(i), io.ldu_io.vec_forward.forwardData(i), + Mux(io.ldu_io.ubuffer.forwardMask(i), io.ldu_io.ubuffer.forwardData(i), + io.ldu_io.sbuffer.forwardData(i)))) } XSDebug(s2_fire && s2_ld_flow, "[FWD LOAD RESP] pc %x fwd %x(%b) + %x(%b)\n", @@ -1159,7 +1164,7 @@ class HybridUnit(implicit p: Parameters) extends XSModule io.ldu_io.fast_rep_out.bits.delayedLoadError := s3_dly_ld_err io.ldu_io.lsq.ldin.bits.dcacheRequireReplay := s3_dcache_rep - val s3_vp_match_fail = RegNext(io.ldu_io.lsq.forward.matchInvalid || io.ldu_io.sbuffer.matchInvalid) && s3_troublem + val s3_vp_match_fail = RegNext(io.ldu_io.lsq.forward.matchInvalid || io.ldu_io.sbuffer.matchInvalid || io.ldu_io.ubuffer.matchInvalid) && s3_troublem val s3_ldld_rep_inst = io.ldu_io.lsq.ldld_nuke_query.resp.valid && io.ldu_io.lsq.ldld_nuke_query.resp.bits.rep_frm_fetch && @@ -1191,6 +1196,7 @@ class HybridUnit(implicit p: Parameters) extends XSModule s3_out.bits.uop.replayInst := s3_rep_frm_fetch s3_out.bits.data := s3_in.data s3_out.bits.debug.isMMIO := s3_in.mmio + s3_out.bits.debug.isNC := s3_in.nc s3_out.bits.debug.isPerfCnt := false.B s3_out.bits.debug.paddr := s3_in.paddr s3_out.bits.debug.vaddr := s3_in.vaddr diff --git a/src/main/scala/xiangshan/mem/pipeline/LoadUnit.scala b/src/main/scala/xiangshan/mem/pipeline/LoadUnit.scala index 89a11ed5a7..ba2e7b702c 100644 --- a/src/main/scala/xiangshan/mem/pipeline/LoadUnit.scala +++ b/src/main/scala/xiangshan/mem/pipeline/LoadUnit.scala @@ -77,11 +77,18 @@ class LoadToLsqReplayIO(implicit p: Parameters) extends XSBundle class LoadToLsqIO(implicit p: Parameters) extends XSBundle { + // ldu -> lsq UncacheBuffer val ldin = DecoupledIO(new LqWriteBundle) + // uncache-mmio -> ldu val uncache = Flipped(DecoupledIO(new MemExuOutput)) val ld_raw_data = Input(new LoadDataFromLQBundle) + // uncache-nc -> ldu + val nc_ldin = Flipped(DecoupledIO(new LsPipelineBundle)) + // storequeue -> ldu val forward = new PipeLoadForwardQueryIO + // ldu -> lsq LQRAW val stld_nuke_query = new LoadNukeQueryIO + // ldu -> lsq LQRAR val ldld_nuke_query = new 
LoadNukeQueryIO } @@ -128,6 +135,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule val pmp = Flipped(new PMPRespBundle()) // arrive same to tlb now val dcache = new DCacheLoadIO val sbuffer = new LoadForwardQueryIO + val ubuffer = new LoadForwardQueryIO val lsq = new LoadToLsqIO val tl_d_channel = Input(new DcacheToLduForwardIO) val forward_mshr = Flipped(new LduToMissqueueForwardIO) @@ -207,10 +215,12 @@ class LoadUnit(implicit p: Parameters) extends XSModule // generate addr, use addr to query DCache and DTLB val s0_valid = Wire(Bool()) val s0_mmio_select = Wire(Bool()) + val s0_nc_select = Wire(Bool()) val s0_kill = Wire(Bool()) val s0_can_go = s1_ready val s0_fire = s0_valid && s0_can_go val s0_mmio_fire = s0_mmio_select && s0_can_go + val s0_nc_fire = s0_nc_select && s0_can_go val s0_out = Wire(new LqWriteBundle) val s0_tlb_valid = Wire(Bool()) val s0_tlb_hlv = Wire(Bool()) @@ -254,29 +264,34 @@ class LoadUnit(implicit p: Parameters) extends XSModule val elemIdxInsideVd = UInt(elemIdxBits.W) val alignedType = UInt(alignTypeBits.W) val vecBaseVaddr = UInt(VAddrBits.W) + //for Svpbmt NC + val isnc = Bool() + val paddr = UInt(PAddrBits.W) + val data = UInt((VLEN+1).W) } val s0_sel_src = Wire(new FlowSource) // load flow select/gen - // src0: misalignBuffer load (io.misalign_ldin) - // src1: super load replayed by LSQ (cache miss replay) (io.replay) - // src2: fast load replay (io.fast_rep_in) - // src3: mmio (io.lsq.uncache) - // src4: load replayed by LSQ (io.replay) - // src5: hardware prefetch from prefetchor (high confidence) (io.prefetch) + // src 0: misalignBuffer load (io.misalign_ldin) + // src 1: super load replayed by LSQ (cache miss replay) (io.replay) + // src 2: fast load replay (io.fast_rep_in) + // src 3: mmio (io.lsq.uncache) + // src 4: nc (io.lsq.nc_ldin) + // src 5: load replayed by LSQ (io.replay) + // src 6: hardware prefetch from prefetchor (high confidence) (io.prefetch) // NOTE: Now vec/int loads are sent from same RS // A vec load will be splited into multiple uops, // so as long as one uop is issued, // the other uops should have higher priority - // src6: vec read from RS (io.vecldin) - // src7: int read / software prefetch first issue from RS (io.in) - // src8: load try pointchaising when no issued or replayed load (io.fastpath) - // src9: hardware prefetch from prefetchor (high confidence) (io.prefetch) + // src 7: vec read from RS (io.vecldin) + // src 8: int read / software prefetch first issue from RS (io.in) + // src 9: load try pointchaising when no issued or replayed load (io.fastpath) + // src10: hardware prefetch from prefetchor (high confidence) (io.prefetch) // priority: high to low val s0_rep_stall = io.ldin.valid && isAfter(io.replay.bits.uop.robIdx, io.ldin.bits.uop.robIdx) - private val SRC_NUM = 10 + private val SRC_NUM = 11 private val Seq( - mab_idx, super_rep_idx, fast_rep_idx, mmio_idx, lsq_rep_idx, + mab_idx, super_rep_idx, fast_rep_idx, mmio_idx, nc_idx, lsq_rep_idx, high_pf_idx, vec_iss_idx, int_iss_idx, l2l_fwd_idx, low_pf_idx ) = (0 until SRC_NUM).toSeq // load flow source valid @@ -285,6 +300,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule io.replay.valid && io.replay.bits.forward_tlDchannel, io.fast_rep_in.valid, io.lsq.uncache.valid, + io.lsq.nc_ldin.valid, io.replay.valid && !io.replay.bits.forward_tlDchannel && !s0_rep_stall, io.prefetch_req.valid && io.prefetch_req.bits.confidence > 0.U, io.vecldin.valid, @@ -308,8 +324,10 @@ class LoadUnit(implicit p: Parameters) extends XSModule 
dontTouch(s0_src_select_vec) } - val s0_tlb_no_query = s0_hw_prf_select || s0_src_select_vec(fast_rep_idx) || s0_src_select_vec(mmio_idx) || s0_sel_src.prf_i - s0_valid := ( + val s0_tlb_no_query = s0_hw_prf_select || s0_sel_src.prf_i || + s0_src_select_vec(fast_rep_idx) || s0_src_select_vec(mmio_idx) || + s0_src_select_vec(nc_idx) + s0_valid := !s0_kill && (s0_src_select_vec(nc_idx) || (( s0_src_valid_vec(mab_idx) || s0_src_valid_vec(super_rep_idx) || s0_src_valid_vec(fast_rep_idx) || @@ -319,9 +337,13 @@ class LoadUnit(implicit p: Parameters) extends XSModule s0_src_valid_vec(int_iss_idx) || s0_src_valid_vec(l2l_fwd_idx) || s0_src_valid_vec(low_pf_idx) - ) && !s0_src_select_vec(mmio_idx) && io.dcache.req.ready && !s0_kill + ) && !s0_src_select_vec(mmio_idx) && io.dcache.req.ready)) s0_mmio_select := s0_src_select_vec(mmio_idx) && !s0_kill + s0_nc_select := s0_src_select_vec(nc_idx) && !s0_kill + //judgment: is NC with data or not. + //If true, it's from `io.lsq.nc_ldin` or `io.fast_rep_in` + val s0_nc_with_data = s0_sel_src.isnc && !s0_kill // if is hardware prefetch or fast replay, don't send valid to tlb s0_tlb_valid := ( @@ -367,7 +389,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule io.tlb.req.bits.debug.isFirstIssue := s0_sel_src.isFirstIssue // query DCache - io.dcache.req.valid := s0_valid && !s0_sel_src.prf_i + io.dcache.req.valid := s0_valid && !s0_sel_src.prf_i && !s0_nc_with_data io.dcache.req.bits.cmd := Mux(s0_sel_src.prf_rd, MemoryOpConstants.M_PFR, Mux(s0_sel_src.prf_wr, MemoryOpConstants.M_PFW, MemoryOpConstants.M_XRD) @@ -416,6 +438,8 @@ class LoadUnit(implicit p: Parameters) extends XSModule def fromFastReplaySource(src: LqWriteBundle): FlowSource = { val out = WireInit(0.U.asTypeOf(new FlowSource)) + out.vaddr := src.vaddr + out.paddr := src.paddr out.mask := src.mask out.uop := src.uop out.try_l2l := false.B @@ -443,6 +467,8 @@ class LoadUnit(implicit p: Parameters) extends XSModule out.elemIdx := src.elemIdx out.elemIdxInsideVd := src.elemIdxInsideVd out.alignedType := src.alignedType + out.isnc := src.nc + out.data := src.data out } @@ -469,6 +495,22 @@ class LoadUnit(implicit p: Parameters) extends XSModule out } + def fromNcSource(src: LsPipelineBundle): FlowSource = { + val out = WireInit(0.U.asTypeOf(new FlowSource)) + out.vaddr := src.vaddr + out.paddr := src.paddr + out.mask := genVWmask(src.vaddr, src.uop.fuOpType(1,0)) + out.uop := src.uop + out.has_rob_entry := true.B + out.sched_idx := src.schedIndex + out.isvec := src.isvec + out.is128bit := src.is128bit + out.vecActive := src.vecActive + out.isnc := true.B + out.data := src.data + out + } + def fromNormalReplaySource(src: LsPipelineBundle): FlowSource = { val out = WireInit(0.U.asTypeOf(new FlowSource)) out.mask := Mux(src.isvec, src.mask, genVWmask(src.vaddr, src.uop.fuOpType(1, 0))) @@ -622,6 +664,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule fromNormalReplaySource(io.replay.bits), fromFastReplaySource(io.fast_rep_in.bits), fromMmioSource(io.lsq.uncache.bits), + fromNcSource(io.lsq.nc_ldin.bits), fromNormalReplaySource(io.replay.bits), fromPrefetchSource(io.prefetch_req.bits), fromVecIssueSource(io.vecldin.bits), @@ -657,15 +700,11 @@ class LoadUnit(implicit p: Parameters) extends XSModule ) ) - s0_dcache_vaddr := Mux( - s0_src_select_vec(fast_rep_idx), - io.fast_rep_in.bits.vaddr, - Mux( - s0_hw_prf_select, - io.prefetch_req.bits.getVaddr(), - s0_tlb_vaddr - ) - ) + s0_dcache_vaddr := + Mux(s0_src_select_vec(fast_rep_idx), io.fast_rep_in.bits.vaddr, + Mux(s0_hw_prf_select, 
io.prefetch_req.bits.getVaddr(), + Mux(s0_src_select_vec(nc_idx), io.lsq.nc_ldin.bits.vaddr, // not for dcache access, but for address alignment check + s0_tlb_vaddr))) s0_tlb_hlv := Mux( s0_src_valid_vec(mab_idx), @@ -706,7 +745,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule // accept load flow if dcache ready (tlb is always ready) // TODO: prefetch need writeback to loadQueueFlag s0_out := DontCare - s0_out.vaddr := s0_dcache_vaddr + s0_out.vaddr := Mux(s0_nc_with_data, s0_sel_src.vaddr, s0_dcache_vaddr) s0_out.fullva := s0_tlb_fullva s0_out.mask := s0_sel_src.mask s0_out.uop := s0_sel_src.uop @@ -722,8 +761,11 @@ class LoadUnit(implicit p: Parameters) extends XSModule s0_out.is128bit := s0_sel_src.is128bit s0_out.isFrmMisAlignBuf := s0_sel_src.frm_mabuf s0_out.uop_unit_stride_fof := s0_sel_src.uop_unit_stride_fof - s0_out.paddr := Mux(s0_src_valid_vec(fast_rep_idx), io.fast_rep_in.bits.paddr, - Mux(s0_src_select_vec(int_iss_idx) && s0_sel_src.prf_i, 0.U, io.prefetch_req.bits.paddr)) // only for prefetch and fast_rep + s0_out.paddr := + Mux(s0_src_valid_vec(nc_idx), io.lsq.nc_ldin.bits.paddr, + Mux(s0_src_valid_vec(fast_rep_idx), io.fast_rep_in.bits.paddr, + Mux(s0_src_select_vec(int_iss_idx) && s0_sel_src.prf_i, 0.U, + io.prefetch_req.bits.paddr))) // only for nc, fast_rep, prefetch s0_out.tlbNoQuery := s0_tlb_no_query // s0_out.rob_idx_valid := s0_rob_idx_valid // s0_out.inner_idx := s0_inner_idx @@ -747,12 +789,16 @@ class LoadUnit(implicit p: Parameters) extends XSModule s0_out.uop.debugInfo.tlbFirstReqTime := s0_sel_src.uop.debugInfo.tlbFirstReqTime } s0_out.schedIndex := s0_sel_src.sched_idx + //for Svpbmt Nc + s0_out.nc := s0_sel_src.isnc + s0_out.data := s0_sel_src.data // load fast replay io.fast_rep_in.ready := (s0_can_go && io.dcache.req.ready && s0_src_ready_vec(fast_rep_idx)) // mmio io.lsq.uncache.ready := s0_mmio_fire + io.lsq.nc_ldin.ready := s0_src_ready_vec(nc_idx) && s0_can_go // load flow source ready // cache missed load has highest priority @@ -775,11 +821,12 @@ class LoadUnit(implicit p: Parameters) extends XSModule io.dcache.replacementUpdated := Mux(s0_src_select_vec(lsq_rep_idx) || s0_src_select_vec(super_rep_idx), io.replay.bits.replacementUpdated, false.B) // load wakeup - // TODO: vector load wakeup? + // TODO: vector load wakeup? frm_mabuf wakeup? 
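Editor's aside (not part of the patch): the s0 address selection above gives an NC request from the load queue priority over fast replay and prefetch when picking s0_out.paddr, while its vaddr is used only for the alignment check. A minimal plain-Scala sketch of that priority, with illustrative names and types rather than the module's actual Chisel signals:

object S0NcPaddrPrioritySketch {
  // Mirrors the nested Mux for s0_out.paddr: nc > fast replay > prefetch (0 for prefetch.i).
  def selectPaddr(ncValid: Boolean, ncPaddr: Long,
                  fastRepValid: Boolean, fastRepPaddr: Long,
                  prefetchI: Boolean, prefetchPaddr: Long): Long =
    if (ncValid) ncPaddr
    else if (fastRepValid) fastRepPaddr
    else if (prefetchI) 0L
    else prefetchPaddr

  def main(args: Array[String]): Unit = {
    // An NC request outranks a concurrent fast-replay request, which outranks the prefetch paddr.
    assert(selectPaddr(true, 0x100L, true, 0x200L, false, 0x300L) == 0x100L)
    assert(selectPaddr(false, 0x100L, true, 0x200L, false, 0x300L) == 0x200L)
  }
}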
val s0_wakeup_selector = Seq( s0_src_valid_vec(super_rep_idx), s0_src_valid_vec(fast_rep_idx), s0_mmio_fire, + s0_nc_fire, s0_src_valid_vec(lsq_rep_idx), s0_src_valid_vec(int_iss_idx) ) @@ -787,12 +834,18 @@ class LoadUnit(implicit p: Parameters) extends XSModule io.replay.bits.uop, io.fast_rep_in.bits.uop, io.lsq.uncache.bits.uop, + io.lsq.nc_ldin.bits.uop, io.replay.bits.uop, io.ldin.bits.uop, ) val s0_wakeup_uop = ParallelPriorityMux(s0_wakeup_selector, s0_wakeup_format) - io.wakeup.valid := s0_fire && !s0_sel_src.isvec && !s0_sel_src.frm_mabuf && - (s0_src_valid_vec(super_rep_idx) || s0_src_valid_vec(fast_rep_idx) || s0_src_valid_vec(lsq_rep_idx) || ((s0_src_valid_vec(int_iss_idx) && !s0_sel_src.prf) && !s0_src_valid_vec(vec_iss_idx) && !s0_src_valid_vec(high_pf_idx))) || s0_mmio_fire + io.wakeup.valid := s0_fire && !s0_sel_src.isvec && !s0_sel_src.frm_mabuf && ( + s0_src_valid_vec(super_rep_idx) || + s0_src_valid_vec(fast_rep_idx) || + s0_src_valid_vec(lsq_rep_idx) || + (s0_src_valid_vec(int_iss_idx) && !s0_sel_src.prf && + !s0_src_valid_vec(vec_iss_idx) && !s0_src_valid_vec(high_pf_idx)) + ) || s0_mmio_fire || s0_nc_fire io.wakeup.bits := s0_wakeup_uop // prefetch.i(Zicbop) @@ -818,6 +871,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule val s1_can_go = s2_ready val s1_fire = s1_valid && !s1_kill && s1_can_go val s1_vecActive = RegEnable(s0_out.vecActive, true.B, s0_fire) + val s1_nc_with_data = RegNext(s0_nc_with_data) s1_ready := !s1_valid || s1_kill || s2_ready when (s0_fire) { s1_valid := true.B } @@ -838,7 +892,8 @@ class LoadUnit(implicit p: Parameters) extends XSModule val s1_exception = ExceptionNO.selectByFu(s1_out.uop.exceptionVec, LduCfg).asUInt.orR // af & pf exception were modified below. val s1_tlb_miss = io.tlb.resp.bits.miss && io.tlb.resp.valid && s1_valid val s1_tlb_fast_miss = io.tlb.resp.bits.fastMiss && io.tlb.resp.valid && s1_valid - val s1_pbmt = Mux(io.tlb.resp.valid, io.tlb.resp.bits.pbmt(0), 0.U(2.W)) + val s1_pbmt = Mux(!s1_tlb_miss, io.tlb.resp.bits.pbmt.head, 0.U(Pbmt.width.W)) + val s1_nc = s1_in.nc val s1_prf = s1_in.isPrefetch val s1_hw_prf = s1_in.isHWPrefetch val s1_sw_prf = s1_prf && !s1_hw_prf @@ -874,6 +929,14 @@ class LoadUnit(implicit p: Parameters) extends XSModule io.sbuffer.mask := s1_in.mask io.sbuffer.pc := s1_in.uop.pc // FIXME: remove it + io.ubuffer.valid := s1_valid && s1_nc_with_data && !(s1_exception || s1_tlb_miss || s1_kill || s1_dly_err || s1_prf) + io.ubuffer.vaddr := s1_vaddr + io.ubuffer.paddr := s1_paddr_dup_lsu + io.ubuffer.uop := s1_in.uop + io.ubuffer.sqIdx := s1_in.uop.sqIdx + io.ubuffer.mask := s1_in.mask + io.ubuffer.pc := s1_in.uop.pc // FIXME: remove it + io.lsq.forward.valid := s1_valid && !(s1_exception || s1_tlb_miss || s1_kill || s1_dly_err || s1_prf) io.lsq.forward.vaddr := s1_vaddr io.lsq.forward.paddr := s1_paddr_dup_lsu @@ -908,6 +971,8 @@ class LoadUnit(implicit p: Parameters) extends XSModule s1_out.rep_info.debug := s1_in.uop.debugInfo s1_out.rep_info.nuke := s1_nuke && !s1_sw_prf s1_out.delayedLoadError := s1_dly_err + s1_out.nc := s1_nc || Pbmt.isNC(s1_pbmt) + s1_out.mmio := Pbmt.isIO(s1_pbmt) when (!s1_dly_err) { // current ori test will cause the case of ldest == 0, below will be modifeid in the future. 
@@ -945,10 +1010,14 @@ class LoadUnit(implicit p: Parameters) extends XSModule s1_redirect_reg.valid := GatedValidRegNext(io.redirect.valid) s1_kill := s1_fast_rep_dly_kill || - s1_cancel_ptr_chasing || - s1_in.uop.robIdx.needFlush(io.redirect) || - (s1_in.uop.robIdx.needFlush(s1_redirect_reg) && !GatedValidRegNext(s0_try_ptr_chasing)) || - RegEnable(s0_kill, false.B, io.ldin.valid || io.vecldin.valid || io.replay.valid || io.l2l_fwd_in.valid || io.fast_rep_in.valid || io.misalign_ldin.valid) + s1_cancel_ptr_chasing || + s1_in.uop.robIdx.needFlush(io.redirect) || + (s1_in.uop.robIdx.needFlush(s1_redirect_reg) && !GatedValidRegNext(s0_try_ptr_chasing)) || + RegEnable(s0_kill, false.B, io.ldin.valid || + io.vecldin.valid || io.replay.valid || + io.l2l_fwd_in.valid || io.fast_rep_in.valid || + io.misalign_ldin.valid || io.lsq.nc_ldin.valid + ) if (EnableLoadToLoadForward) { // Sometimes, we need to cancel the load-load forwarding. @@ -982,7 +1051,11 @@ class LoadUnit(implicit p: Parameters) extends XSModule s1_in.uop.debugInfo.tlbRespTime := GTimer() } when (!s1_cancel_ptr_chasing) { - s0_ptr_chasing_canceled := s1_try_ptr_chasing && !io.replay.fire && !io.fast_rep_in.fire && !(s0_src_valid_vec(high_pf_idx) && io.canAcceptHighConfPrefetch) && !io.misalign_ldin.fire + s0_ptr_chasing_canceled := s1_try_ptr_chasing && + !io.replay.fire && !io.fast_rep_in.fire && + !(s0_src_valid_vec(high_pf_idx) && io.canAcceptHighConfPrefetch) && + !io.misalign_ldin.fire && + !io.lsq.nc_ldin.valid when (s1_try_ptr_chasing) { io.ldin.ready := true.B } @@ -1048,6 +1121,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule val s2_frm_mabuf = s2_in.isFrmMisAlignBuf val s2_pbmt = RegEnable(s1_pbmt, s1_fire) val s2_trigger_debug_mode = RegEnable(s1_trigger_debug_mode, false.B, s1_fire) + val s2_nc_with_data = RegNext(s1_nc_with_data) s2_kill := s2_in.uop.robIdx.needFlush(io.redirect) s2_ready := !s2_valid || s2_kill || s3_ready @@ -1065,12 +1139,14 @@ class LoadUnit(implicit p: Parameters) extends XSModule // if such exception happen, that inst and its exception info // will be force writebacked to rob val s2_exception_vec = WireInit(s2_in.uop.exceptionVec) + val s2_actually_uncache = Pbmt.isPMA(s2_pbmt) && s2_pmp.mmio || s2_in.nc || s2_in.mmio when (!s2_in.delayedLoadError) { - s2_exception_vec(loadAccessFault) := (s2_in.uop.exceptionVec(loadAccessFault) || - s2_pmp.ld || - s2_isvec && s2_pmp.mmio && !s2_prf && !s2_in.tlbMiss || - (io.dcache.resp.bits.tag_error && GatedValidRegNext(io.csrCtrl.cache_error_enable)) - ) && s2_vecActive + s2_exception_vec(loadAccessFault) := s2_vecActive && ( + s2_in.uop.exceptionVec(loadAccessFault) || + s2_pmp.ld || + s2_isvec && s2_actually_uncache && !s2_prf && !s2_in.tlbMiss || + io.dcache.resp.bits.tag_error && GatedValidRegNext(io.csrCtrl.cache_error_enable) + ) } // soft prefetch will not trigger any exception (but ecc error interrupt may @@ -1091,11 +1167,11 @@ class LoadUnit(implicit p: Parameters) extends XSModule // writeback access fault caused by ecc error / bus error // * ecc data error is slow to generate, so we will not use it until load stage 3 // * in load stage 3, an extra signal io.load_error will be used to - val s2_actually_mmio = s2_pmp.mmio || Pbmt.isUncache(s2_pbmt) - val s2_mmio = !s2_prf && - s2_actually_mmio && - !s2_exception && - !s2_in.tlbMiss + // * if pbmt =/= 0, mmio is up to pbmt; otherwise, it's up to pmp + val s2_mmio = !s2_prf && + !s2_exception && !s2_in.tlbMiss && + Mux(Pbmt.isUncache(s2_pbmt), s2_in.mmio, s2_pmp.mmio) + val s2_uncache = 
!s2_prf && !s2_exception && !s2_in.tlbMiss && s2_actually_uncache val s2_full_fwd = Wire(Bool()) val s2_mem_amb = s2_in.uop.storeSetHit && @@ -1105,19 +1181,19 @@ class LoadUnit(implicit p: Parameters) extends XSModule val s2_fwd_fail = io.lsq.forward.dataInvalid && RegNext(io.lsq.forward.valid) val s2_dcache_miss = io.dcache.resp.bits.miss && !s2_fwd_frm_d_chan_or_mshr && - !s2_full_fwd + !s2_full_fwd && !s2_in.nc val s2_mq_nack = io.dcache.s2_mq_nack && !s2_fwd_frm_d_chan_or_mshr && - !s2_full_fwd + !s2_full_fwd && !s2_in.nc val s2_bank_conflict = io.dcache.s2_bank_conflict && !s2_fwd_frm_d_chan_or_mshr && - !s2_full_fwd + !s2_full_fwd && !s2_in.nc val s2_wpu_pred_fail = io.dcache.s2_wpu_pred_fail && !s2_fwd_frm_d_chan_or_mshr && - !s2_full_fwd + !s2_full_fwd && !s2_in.nc val s2_rar_nack = io.lsq.ldld_nuke_query.req.valid && !io.lsq.ldld_nuke_query.req.ready @@ -1143,13 +1219,15 @@ class LoadUnit(implicit p: Parameters) extends XSModule val s2_cache_handled = io.dcache.resp.bits.handled + // If this is an NC access that already carries data, it must still go through the replay checks; + // otherwise an uncache (NC/MMIO) access is handled by the uncache buffer instead. val s2_troublem = !s2_exception && - !s2_mmio && + (!s2_uncache || s2_nc_with_data) && !s2_prf && !s2_in.delayedLoadError io.dcache.resp.ready := true.B - val s2_dcache_should_resp = !(s2_in.tlbMiss || s2_exception || s2_in.delayedLoadError || s2_mmio || s2_prf) + val s2_dcache_should_resp = !(s2_in.tlbMiss || s2_exception || s2_in.delayedLoadError || s2_uncache || s2_prf) assert(!(s2_valid && (s2_dcache_should_resp && !io.dcache.resp.valid)), "DCache response got lost") // fast replay require @@ -1177,8 +1255,9 @@ class LoadUnit(implicit p: Parameters) extends XSModule val s2_data_fwded = s2_dcache_miss && s2_full_fwd - val s2_vp_match_fail = (io.lsq.forward.matchInvalid || io.sbuffer.matchInvalid) && s2_troublem - val s2_safe_wakeup = !s2_out.rep_info.need_rep && !s2_mmio && !s2_mis_align && !s2_exception // don't need to replay and is not a mmio and misalign + val s2_fwd_vp_match_invalid = io.lsq.forward.matchInvalid || io.sbuffer.matchInvalid || io.ubuffer.matchInvalid + val s2_vp_match_fail = s2_fwd_vp_match_invalid && s2_troublem + val s2_safe_wakeup = !s2_out.rep_info.need_rep && !s2_mmio && (!s2_in.nc || s2_nc_with_data) && !s2_mis_align && !s2_exception // no replay needed; not MMIO, not an NC access without data, and not misaligned val s2_safe_writeback = s2_exception || s2_safe_wakeup || s2_vp_match_fail // ld-ld violation require @@ -1186,14 +1265,16 @@ class LoadUnit(implicit p: Parameters) extends XSModule io.lsq.ldld_nuke_query.req.bits.uop := s2_in.uop io.lsq.ldld_nuke_query.req.bits.mask := s2_in.mask io.lsq.ldld_nuke_query.req.bits.paddr := s2_in.paddr - io.lsq.ldld_nuke_query.req.bits.data_valid := Mux(s2_full_fwd || s2_fwd_data_valid, true.B, !s2_dcache_miss) + io.lsq.ldld_nuke_query.req.bits.data_valid := Mux(s2_full_fwd || s2_fwd_data_valid || s2_nc_with_data, true.B, !s2_dcache_miss) + io.lsq.ldld_nuke_query.req.bits.is_nc := s2_nc_with_data // st-ld violation require io.lsq.stld_nuke_query.req.valid := s2_valid && s2_can_query io.lsq.stld_nuke_query.req.bits.uop := s2_in.uop io.lsq.stld_nuke_query.req.bits.mask := s2_in.mask io.lsq.stld_nuke_query.req.bits.paddr := s2_in.paddr - io.lsq.stld_nuke_query.req.bits.data_valid := Mux(s2_full_fwd || s2_fwd_data_valid, true.B, !s2_dcache_miss) + io.lsq.stld_nuke_query.req.bits.data_valid := Mux(s2_full_fwd || s2_fwd_data_valid || s2_nc_with_data, true.B, !s2_dcache_miss) + io.lsq.stld_nuke_query.req.bits.is_nc := s2_nc_with_data // merge forward
result // lsq has higher priority than sbuffer @@ -1202,8 +1283,11 @@ class LoadUnit(implicit p: Parameters) extends XSModule s2_full_fwd := ((~s2_fwd_mask.asUInt).asUInt & s2_in.mask) === 0.U && !io.lsq.forward.dataInvalid // generate XLEN/8 Muxs for (i <- 0 until VLEN / 8) { - s2_fwd_mask(i) := io.lsq.forward.forwardMask(i) || io.sbuffer.forwardMask(i) - s2_fwd_data(i) := Mux(io.lsq.forward.forwardMask(i), io.lsq.forward.forwardData(i), io.sbuffer.forwardData(i)) + s2_fwd_mask(i) := io.lsq.forward.forwardMask(i) || io.sbuffer.forwardMask(i) || io.ubuffer.forwardMask(i) + s2_fwd_data(i) := + Mux(io.lsq.forward.forwardMask(i), io.lsq.forward.forwardData(i), + Mux(s2_nc_with_data, io.ubuffer.forwardData(i), + io.sbuffer.forwardData(i))) } XSDebug(s2_fire, "[FWD LOAD RESP] pc %x fwd %x(%b) + %x(%b)\n", @@ -1214,8 +1298,8 @@ class LoadUnit(implicit p: Parameters) extends XSModule // s2_out := s2_in - s2_out.data := 0.U // data will be generated in load s3 s2_out.uop.fpWen := s2_in.uop.fpWen + s2_out.nc := s2_in.nc s2_out.mmio := s2_mmio s2_out.uop.flushPipe := false.B s2_out.uop.exceptionVec := s2_exception_vec @@ -1277,7 +1361,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule !s1_kill && !io.tlb.resp.bits.miss && !io.lsq.forward.dataInvalidFast - io.fast_uop.valid := GatedValidRegNext(s1_fast_uop_valid) && (s2_valid && !s2_out.rep_info.need_rep && !s2_mmio && !(s2_prf && !s2_hw_prf)) && !s2_isvec && !s2_frm_mabuf + io.fast_uop.valid := GatedValidRegNext(s1_fast_uop_valid) && (s2_valid && !s2_out.rep_info.need_rep && !s2_uncache && !(s2_prf && !s2_hw_prf)) && !s2_isvec && !s2_frm_mabuf io.fast_uop.bits := RegEnable(s1_out.uop, s1_fast_uop_valid) // @@ -1286,7 +1370,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule // RegNext prefetch train for better timing // ** Now, prefetch train is valid at load s3 ** val s2_prefetch_train_valid = WireInit(false.B) - s2_prefetch_train_valid := s2_valid && !s2_actually_mmio && (!s2_in.tlbMiss || s2_hw_prf) + s2_prefetch_train_valid := s2_valid && !s2_actually_uncache && (!s2_in.tlbMiss || s2_hw_prf) io.prefetch_train.valid := GatedValidRegNext(s2_prefetch_train_valid) io.prefetch_train.bits.fromLsPipelineBundle(s2_in, latch = true, enable = s2_prefetch_train_valid) io.prefetch_train.bits.miss := RegEnable(io.dcache.resp.bits.miss, s2_prefetch_train_valid) // TODO: use trace with bank conflict? 
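Editor's aside (not part of the patch): the per-byte forward merge above now has three sources; store queue forwarding wins per byte, and for an NC load that already carries data the uncache buffer replaces the sbuffer as the fallback. A small runnable plain-Scala sketch of that selection, assuming one valid bit and one data byte per source; names are illustrative:

object NcForwardMergeSketch {
  case class Fwd(mask: Seq[Boolean], data: Seq[Int])

  // Per-byte merge: lsq forwarding has highest priority; otherwise an NC load with data
  // falls back to the uncache buffer (ubuffer), while a normal load falls back to the sbuffer.
  def merge(lsq: Fwd, sbuffer: Fwd, ubuffer: Fwd, ncWithData: Boolean): (Seq[Boolean], Seq[Int]) = {
    val mask = lsq.mask.indices.map(i => lsq.mask(i) || sbuffer.mask(i) || ubuffer.mask(i))
    val data = lsq.mask.indices.map { i =>
      if (lsq.mask(i)) lsq.data(i)
      else if (ncWithData) ubuffer.data(i)
      else sbuffer.data(i)
    }
    (mask, data)
  }

  def main(args: Array[String]): Unit = {
    val lsq  = Fwd(Seq(true, false), Seq(0xAA, 0x00))
    val sbuf = Fwd(Seq(false, true), Seq(0x00, 0xBB))
    val ubuf = Fwd(Seq(false, true), Seq(0x00, 0xCC))
    assert(merge(lsq, sbuf, ubuf, ncWithData = false)._2 == Seq(0xAA, 0xBB))
    assert(merge(lsq, sbuf, ubuf, ncWithData = true)._2 == Seq(0xAA, 0xCC))
  }
}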
@@ -1296,7 +1380,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule io.s2_prefetch_spec := s2_prefetch_train_valid val s2_prefetch_train_l1_valid = WireInit(false.B) - s2_prefetch_train_l1_valid := s2_valid && !s2_actually_mmio + s2_prefetch_train_l1_valid := s2_valid && !s2_actually_uncache io.prefetch_train_l1.valid := GatedValidRegNext(s2_prefetch_train_l1_valid) io.prefetch_train_l1.bits.fromLsPipelineBundle(s2_in, latch = true, enable = s2_prefetch_train_l1_valid) io.prefetch_train_l1.bits.miss := RegEnable(io.dcache.resp.bits.miss, s2_prefetch_train_l1_valid) @@ -1311,7 +1395,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule io.dcache.s1_pc := s1_out.uop.pc io.dcache.s2_pc := s2_out.uop.pc } - io.dcache.s2_kill := s2_pmp.ld || s2_actually_mmio || s2_kill + io.dcache.s2_kill := s2_pmp.ld || s2_actually_uncache || s2_kill val s1_ld_left_fire = s1_valid && !s1_kill && s2_ready val s2_ld_valid_dup = RegInit(0.U(6.W)) @@ -1331,6 +1415,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule val s3_dcache_rep = RegEnable(s2_dcache_fast_rep && s2_troublem, false.B, s2_fire) val s3_ld_valid_dup = RegEnable(s2_ld_valid_dup, s2_fire) val s3_fast_rep = Wire(Bool()) + val s3_nc_with_data = RegNext(s2_nc_with_data) val s3_troublem = GatedValidRegNext(s2_troublem) val s3_kill = s3_in.uop.robIdx.needFlush(io.redirect) val s3_vecout = Wire(new OnlyVecExuOutput) @@ -1369,7 +1454,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule io.fast_rep_out.valid := s3_valid && s3_fast_rep && !s3_in.uop.robIdx.needFlush(io.redirect) io.fast_rep_out.bits := s3_in - io.lsq.ldin.valid := s3_valid && (!s3_fast_rep || s3_fast_rep_canceled) && !s3_in.feedbacked && !s3_frm_mabuf + io.lsq.ldin.valid := s3_valid && (!s3_fast_rep || s3_fast_rep_canceled) && !s3_in.feedbacked && !s3_frm_mabuf && !s3_nc_with_data // TODO: check this --by hx // io.lsq.ldin.valid := s3_valid && (!s3_fast_rep || !io.fast_rep_out.ready) && !s3_in.feedbacked && !s3_in.lateKill io.lsq.ldin.bits := s3_in @@ -1388,7 +1473,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule io.lsq.ldin.bits.dcacheRequireReplay := s3_dcache_rep io.fast_rep_out.bits.delayedLoadError := s3_dly_ld_err - val s3_vp_match_fail = GatedValidRegNext(io.lsq.forward.matchInvalid || io.sbuffer.matchInvalid) && s3_troublem + val s3_vp_match_fail = GatedValidRegNext(s2_fwd_vp_match_invalid) && s3_troublem val s3_rep_frm_fetch = s3_vp_match_fail val s3_ldld_rep_inst = io.lsq.ldld_nuke_query.resp.valid && @@ -1415,6 +1500,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule s3_out.bits.data := s3_in.data s3_out.bits.isFromLoadUnit := true.B s3_out.bits.debug.isMMIO := s3_in.mmio + s3_out.bits.debug.isNC := s3_in.nc s3_out.bits.debug.isPerfCnt := false.B s3_out.bits.debug.paddr := s3_in.paddr s3_out.bits.debug.vaddr := s3_in.vaddr @@ -1481,36 +1567,42 @@ class LoadUnit(implicit p: Parameters) extends XSModule val s3_ld_wb_meta = Mux(s3_valid, s3_out.bits, s3_mmio.bits) // data from load queue refill - val s3_ld_raw_data_frm_uncache = RegNextN(io.lsq.ld_raw_data, 3) - val s3_merged_data_frm_uncache = s3_ld_raw_data_frm_uncache.mergedData() - val s3_picked_data_frm_uncache = LookupTree(s3_ld_raw_data_frm_uncache.addrOffset, List( - "b000".U -> s3_merged_data_frm_uncache(63, 0), - "b001".U -> s3_merged_data_frm_uncache(63, 8), - "b010".U -> s3_merged_data_frm_uncache(63, 16), - "b011".U -> s3_merged_data_frm_uncache(63, 24), - "b100".U -> s3_merged_data_frm_uncache(63, 32), - "b101".U -> s3_merged_data_frm_uncache(63, 40), - "b110".U -> 
s3_merged_data_frm_uncache(63, 48), - "b111".U -> s3_merged_data_frm_uncache(63, 56) + val s3_ld_raw_data_frm_mmio = RegNextN(io.lsq.ld_raw_data, 3) + val s3_merged_data_frm_mmio = s3_ld_raw_data_frm_mmio.mergedData() + val s3_picked_data_frm_mmio = LookupTree(s3_ld_raw_data_frm_mmio.addrOffset, List( + "b000".U -> s3_merged_data_frm_mmio(63, 0), + "b001".U -> s3_merged_data_frm_mmio(63, 8), + "b010".U -> s3_merged_data_frm_mmio(63, 16), + "b011".U -> s3_merged_data_frm_mmio(63, 24), + "b100".U -> s3_merged_data_frm_mmio(63, 32), + "b101".U -> s3_merged_data_frm_mmio(63, 40), + "b110".U -> s3_merged_data_frm_mmio(63, 48), + "b111".U -> s3_merged_data_frm_mmio(63, 56) )) - val s3_ld_data_frm_uncache = rdataHelper(s3_ld_raw_data_frm_uncache.uop, s3_picked_data_frm_uncache) - - // data from dcache hit - val s3_ld_raw_data_frm_cache = Wire(new LoadDataFromDcacheBundle) - s3_ld_raw_data_frm_cache.respDcacheData := io.dcache.resp.bits.data - s3_ld_raw_data_frm_cache.forward_D := s2_fwd_frm_d_chan - s3_ld_raw_data_frm_cache.forwardData_D := s2_fwd_data_frm_d_chan - s3_ld_raw_data_frm_cache.forward_mshr := s2_fwd_frm_mshr - s3_ld_raw_data_frm_cache.forwardData_mshr := s2_fwd_data_frm_mshr - s3_ld_raw_data_frm_cache.forward_result_valid := s2_fwd_data_valid - - s3_ld_raw_data_frm_cache.forwardMask := RegEnable(s2_fwd_mask, s2_valid) - s3_ld_raw_data_frm_cache.forwardData := RegEnable(s2_fwd_data, s2_valid) - s3_ld_raw_data_frm_cache.uop := RegEnable(s2_out.uop, s2_valid) - s3_ld_raw_data_frm_cache.addrOffset := RegEnable(s2_out.paddr(3, 0), s2_valid) - - val s3_merged_data_frm_tlD = RegEnable(s3_ld_raw_data_frm_cache.mergeTLData(), s2_valid) - val s3_merged_data_frm_cache = s3_ld_raw_data_frm_cache.mergeLsqFwdData(s3_merged_data_frm_tlD) + val s3_ld_data_frm_mmio = rdataHelper(s3_ld_raw_data_frm_mmio.uop, s3_picked_data_frm_mmio) + + /* Data from the pipeline, forwarded from, respectively: + * dcache hit: [D channel, mshr, sbuffer, sq] + * nc_with_data: [sq] + */ + + val s2_ld_data_frm_nc = shiftDataToHigh(s2_out.paddr, s2_out.data) + + val s3_ld_raw_data_frm_pipe = Wire(new LoadDataFromDcacheBundle) + s3_ld_raw_data_frm_pipe.respDcacheData := Mux(s2_nc_with_data, s2_ld_data_frm_nc, io.dcache.resp.bits.data) + s3_ld_raw_data_frm_pipe.forward_D := s2_fwd_frm_d_chan && !s2_nc_with_data + s3_ld_raw_data_frm_pipe.forwardData_D := s2_fwd_data_frm_d_chan + s3_ld_raw_data_frm_pipe.forward_mshr := s2_fwd_frm_mshr && !s2_nc_with_data + s3_ld_raw_data_frm_pipe.forwardData_mshr := s2_fwd_data_frm_mshr + s3_ld_raw_data_frm_pipe.forward_result_valid := s2_fwd_data_valid + + s3_ld_raw_data_frm_pipe.forwardMask := RegEnable(s2_fwd_mask, s2_valid) + s3_ld_raw_data_frm_pipe.forwardData := RegEnable(s2_fwd_data, s2_valid) + s3_ld_raw_data_frm_pipe.uop := RegEnable(s2_out.uop, s2_valid) + s3_ld_raw_data_frm_pipe.addrOffset := RegEnable(s2_out.paddr(3, 0), s2_valid) + + val s3_merged_data_frm_tlD = RegEnable(s3_ld_raw_data_frm_pipe.mergeTLData(), s2_valid) + val s3_merged_data_frm_pipe = s3_ld_raw_data_frm_pipe.mergeLsqFwdData(s3_merged_data_frm_tlD) // duplicate reg for ldout and vecldout private val LdDataDup = 3 @@ -1533,38 +1625,39 @@ class LoadUnit(implicit p: Parameters) extends XSModule val s3_merged_data_frm_tld_clip = VecInit(List.fill(LdDataDup)( RegEnable(Mux( s2_out.paddr(3), - s3_ld_raw_data_frm_cache.mergeTLData()(VLEN - 1, 64), - s3_ld_raw_data_frm_cache.mergeTLData()(63, 0) + s3_ld_raw_data_frm_pipe.mergeTLData()(VLEN - 1, 64), + s3_ld_raw_data_frm_pipe.mergeTLData()(63, 0) ).asTypeOf(Vec(XLEN / 8,
UInt(8.W))), s2_valid) )) - val s3_merged_data_frm_cache_clip = VecInit((0 until LdDataDup).map(i => { + val s3_merged_data_frm_pipe_clip = VecInit((0 until LdDataDup).map(i => { VecInit((0 until XLEN / 8).map(j => Mux(s3_fwd_mask_clip(i)(j), s3_fwd_data_clip(i)(j), s3_merged_data_frm_tld_clip(i)(j)) )).asUInt })) - val s3_data_frm_cache = VecInit((0 until LdDataDup).map(i => { + val s3_data_frm_pipe = VecInit((0 until LdDataDup).map(i => { VecInit(Seq( - s3_merged_data_frm_cache_clip(i)(63, 0), - s3_merged_data_frm_cache_clip(i)(63, 8), - s3_merged_data_frm_cache_clip(i)(63, 16), - s3_merged_data_frm_cache_clip(i)(63, 24), - s3_merged_data_frm_cache_clip(i)(63, 32), - s3_merged_data_frm_cache_clip(i)(63, 40), - s3_merged_data_frm_cache_clip(i)(63, 48), - s3_merged_data_frm_cache_clip(i)(63, 56), + s3_merged_data_frm_pipe_clip(i)(63, 0), + s3_merged_data_frm_pipe_clip(i)(63, 8), + s3_merged_data_frm_pipe_clip(i)(63, 16), + s3_merged_data_frm_pipe_clip(i)(63, 24), + s3_merged_data_frm_pipe_clip(i)(63, 32), + s3_merged_data_frm_pipe_clip(i)(63, 40), + s3_merged_data_frm_pipe_clip(i)(63, 48), + s3_merged_data_frm_pipe_clip(i)(63, 56), )) })) - val s3_picked_data_frm_cache = VecInit((0 until LdDataDup).map(i => { - Mux1H(s3_data_select_by_offset, s3_data_frm_cache(i)) + val s3_picked_data_frm_pipe = VecInit((0 until LdDataDup).map(i => { + Mux1H(s3_data_select_by_offset, s3_data_frm_pipe(i)) })) - val s3_ld_data_frm_cache = newRdataHelper(s3_data_select, s3_picked_data_frm_cache(0)) + val s3_ld_data_frm_pipe = newRdataHelper(s3_data_select, s3_picked_data_frm_pipe(0)) // FIXME: add 1 cycle delay ? // io.lsq.uncache.ready := !s3_valid val s3_outexception = ExceptionNO.selectByFu(s3_out.bits.uop.exceptionVec, LduCfg).asUInt.orR && s3_vecActive io.ldout.bits := s3_ld_wb_meta - io.ldout.bits.data := Mux(s3_valid, s3_ld_data_frm_cache, s3_ld_data_frm_uncache) + io.ldout.bits.data := Mux(s3_valid, s3_ld_data_frm_pipe, s3_ld_data_frm_mmio) + io.ldout.valid := (s3_mmio.valid || (s3_out.valid && !s3_vecout.isvec && !s3_mis_align && !s3_frm_mabuf)) io.ldout.bits.uop.exceptionVec := ExceptionNO.selectByFu(s3_ld_wb_meta.uop.exceptionVec, LduCfg) @@ -1578,7 +1671,7 @@ class LoadUnit(implicit p: Parameters) extends XSModule // TODO: check this --hx // io.ldout.valid := s3_out.valid && !s3_out.bits.uop.robIdx.needFlush(io.redirect) && !s3_vecout.isvec || // io.lsq.uncache.valid && !io.lsq.uncache.bits.uop.robIdx.needFlush(io.redirect) && !s3_out.valid && !io.lsq.uncache.bits.isVls - // io.ldout.bits.data := Mux(s3_out.valid, s3_ld_data_frm_cache, s3_ld_data_frm_uncache) + // io.ldout.bits.data := Mux(s3_out.valid, s3_ld_data_frm_pipe, s3_ld_data_frm_mmio) // io.ldout.valid := s3_out.valid && !s3_out.bits.uop.robIdx.needFlush(io.redirect) || // s3_mmio.valid && !s3_mmio.bits.uop.robIdx.needFlush(io.redirect) && !s3_out.valid @@ -1594,8 +1687,8 @@ class LoadUnit(implicit p: Parameters) extends XSModule // vec feedback io.vecldout.bits.vecFeedback := vecFeedback // TODO: VLSU, uncache data logic - val vecdata = rdataVecHelper(s3_vec_alignedType(1,0), s3_picked_data_frm_cache(1)) - io.vecldout.bits.vecdata.get := Mux(s3_in.is128bit, s3_merged_data_frm_cache, vecdata) + val vecdata = rdataVecHelper(s3_vec_alignedType(1,0), s3_picked_data_frm_pipe(1)) + io.vecldout.bits.vecdata.get := Mux(s3_in.is128bit, s3_merged_data_frm_pipe, vecdata) io.vecldout.bits.isvec := s3_vecout.isvec io.vecldout.bits.elemIdx := s3_vecout.elemIdx io.vecldout.bits.elemIdxInsideVd.get := s3_vecout.elemIdxInsideVd @@ -1615,6 +1708,7 @@ 
class LoadUnit(implicit p: Parameters) extends XSModule io.vecldout.bits.mmio := DontCare io.vecldout.bits.vstart := s3_vecout.vstart io.vecldout.bits.vecTriggerMask := s3_vecout.vecTriggerMask + io.vecldout.bits.nc := DontCare io.vecldout.valid := s3_out.valid && !s3_out.bits.uop.robIdx.needFlush(io.redirect) && s3_vecout.isvec //|| // TODO: check this, why !io.lsq.uncache.bits.isVls before? @@ -1624,12 +1718,12 @@ class LoadUnit(implicit p: Parameters) extends XSModule io.misalign_ldout.valid := s3_valid && (!s3_fast_rep || s3_fast_rep_canceled) && s3_frm_mabuf io.misalign_ldout.bits := io.lsq.ldin.bits - io.misalign_ldout.bits.data := Mux(s3_in.is128bit, s3_merged_data_frm_cache, s3_picked_data_frm_cache(2)) + io.misalign_ldout.bits.data := Mux(s3_in.is128bit, s3_merged_data_frm_pipe, s3_picked_data_frm_pipe(2)) // fast load to load forward if (EnableLoadToLoadForward) { - io.l2l_fwd_out.valid := s3_valid && !s3_in.mmio && !s3_rep_info.need_rep - io.l2l_fwd_out.data := Mux(s3_in.vaddr(3), s3_merged_data_frm_cache(127, 64), s3_merged_data_frm_cache(63, 0)) + io.l2l_fwd_out.valid := s3_valid && !s3_in.mmio && !s3_in.nc && !s3_rep_info.need_rep + io.l2l_fwd_out.data := Mux(s3_in.vaddr(3), s3_merged_data_frm_pipe(127, 64), s3_merged_data_frm_pipe(63, 0)) io.l2l_fwd_out.dly_ld_err := s3_dly_ld_err || // ecc delayed error s3_ldld_rep_inst || s3_rep_frm_fetch @@ -1729,6 +1823,16 @@ class LoadUnit(implicit p: Parameters) extends XSModule XSPerfAccumulate("load_to_load_forward_fail_addr_align", s1_cancel_ptr_chasing && !s1_ptr_chasing_canceled && !s1_not_fast_match && !s1_fu_op_type_not_ld && s1_addr_misaligned) XSPerfAccumulate("load_to_load_forward_fail_set_mismatch", s1_cancel_ptr_chasing && !s1_ptr_chasing_canceled && !s1_not_fast_match && !s1_fu_op_type_not_ld && !s1_addr_misaligned && s1_addr_mismatch) + XSPerfAccumulate("nc_ld_writeback", io.ldout.valid && s3_nc_with_data) + XSPerfAccumulate("nc_ld_exception", s3_valid && s3_nc_with_data && s3_in.uop.exceptionVec.reduce(_ || _)) + XSPerfAccumulate("nc_ldld_vio", s3_valid && s3_nc_with_data && s3_ldld_rep_inst) + XSPerfAccumulate("nc_stld_vio", s3_valid && s3_nc_with_data && s3_in.rep_info.nuke) + XSPerfAccumulate("nc_ldld_vioNack", s3_valid && s3_nc_with_data && s3_in.rep_info.rar_nack) + XSPerfAccumulate("nc_stld_vioNack", s3_valid && s3_nc_with_data && s3_in.rep_info.raw_nack) + XSPerfAccumulate("nc_stld_fwd", s3_valid && s3_nc_with_data && RegNext(s2_full_fwd)) + XSPerfAccumulate("nc_stld_fwdNotReady", s3_valid && s3_nc_with_data && RegNext(s2_mem_amb || s2_fwd_fail)) + XSPerfAccumulate("nc_stld_fwdAddrMismatch", s3_valid && s3_nc_with_data && s3_vp_match_fail) + // bug lyq: some signals in perfEvents are no longer suitable for the current MemBlock design // hardware performance counter val perfEvents = Seq( diff --git a/src/main/scala/xiangshan/mem/pipeline/StoreUnit.scala b/src/main/scala/xiangshan/mem/pipeline/StoreUnit.scala index 57c821eb09..f60970db0a 100644 --- a/src/main/scala/xiangshan/mem/pipeline/StoreUnit.scala +++ b/src/main/scala/xiangshan/mem/pipeline/StoreUnit.scala @@ -265,9 +265,9 @@ class StoreUnit(implicit p: Parameters) extends XSModule val s1_paddr = io.tlb.resp.bits.paddr(0) val s1_gpaddr = io.tlb.resp.bits.gpaddr(0) val s1_isForVSnonLeafPTE = io.tlb.resp.bits.isForVSnonLeafPTE - val s1_tlb_miss = io.tlb.resp.bits.miss + val s1_tlb_miss = io.tlb.resp.bits.miss && io.tlb.resp.valid && s1_valid val s1_mmio = s1_mmio_cbo - val s1_pbmt = io.tlb.resp.bits.pbmt(0) + val s1_pbmt = Mux(!s1_tlb_miss, 
io.tlb.resp.bits.pbmt.head, 0.U(Pbmt.width.W)) val s1_exception = ExceptionNO.selectByFu(s1_out.uop.exceptionVec, StaCfg).asUInt.orR val s1_isvec = RegEnable(s0_out.isvec, false.B, s0_fire) // val s1_isLastElem = RegEnable(s0_isLastElem, false.B, s0_fire) @@ -319,9 +319,10 @@ class StoreUnit(implicit p: Parameters) extends XSModule s1_out.vaNeedExt := s1_vaNeedExt s1_out.isHyper := s1_isHyper s1_out.miss := false.B - s1_out.mmio := s1_mmio + s1_out.nc := Pbmt.isNC(s1_pbmt) + s1_out.mmio := s1_mmio || Pbmt.isIO(s1_pbmt) s1_out.tlbMiss := s1_tlb_miss - s1_out.atomic := s1_mmio + s1_out.atomic := s1_mmio || Pbmt.isIO(s1_pbmt) s1_out.isForVSnonLeafPTE := s1_isForVSnonLeafPTE when (!s1_out.isvec && RegNext(io.tlb.req.bits.checkfullva) && (s1_out.uop.exceptionVec(storePageFault) || @@ -404,21 +405,23 @@ class StoreUnit(implicit p: Parameters) extends XSModule val s2_exception = RegNext(s1_feedback.bits.hit) && (s2_trigger_debug_mode || ExceptionNO.selectByFu(s2_out.uop.exceptionVec, StaCfg).asUInt.orR) - val s2_mmio = (s2_in.mmio || s2_pmp.mmio || Pbmt.isUncache(s2_pbmt)) && RegNext(s1_feedback.bits.hit) + val s2_mmio = (s2_in.mmio || (Pbmt.isPMA(s2_pbmt) && s2_pmp.mmio)) && RegNext(s1_feedback.bits.hit) + val s2_actually_uncache = (Pbmt.isPMA(s2_pbmt) && s2_pmp.mmio || s2_in.nc || s2_in.mmio) && RegNext(s1_feedback.bits.hit) + val s2_uncache = !s2_exception && !s2_in.tlbMiss && s2_actually_uncache s2_kill := ((s2_mmio && !s2_exception) && !s2_in.isvec) || s2_in.uop.robIdx.needFlush(io.redirect) s2_out := s2_in s2_out.af := s2_out.uop.exceptionVec(storeAccessFault) s2_out.mmio := s2_mmio && !s2_exception - s2_out.atomic := s2_in.atomic || s2_pmp.atomic + s2_out.atomic := s2_in.atomic || Pbmt.isPMA(s2_pbmt) && s2_pmp.atomic s2_out.uop.exceptionVec(storeAccessFault) := (s2_in.uop.exceptionVec(storeAccessFault) || s2_pmp.st || - (s2_in.isvec && s2_pmp.mmio && RegNext(s1_feedback.bits.hit)) + (s2_in.isvec && s2_actually_uncache && RegNext(s1_feedback.bits.hit)) ) && s2_vecActive s2_out.uop.vpu.vstart := s2_in.vecVaddrOffset >> s2_in.uop.vpu.veew // kill dcache write intent request when mmio or exception - io.dcache.s2_kill := (s2_mmio || s2_exception || s2_in.uop.robIdx.needFlush(io.redirect)) + io.dcache.s2_kill := (s2_uncache || s2_exception || s2_in.uop.robIdx.needFlush(io.redirect)) io.dcache.s2_pc := s2_out.uop.pc // TODO: dcache resp io.dcache.resp.ready := true.B @@ -453,7 +456,7 @@ class StoreUnit(implicit p: Parameters) extends XSModule // RegNext prefetch train for better timing // ** Now, prefetch train is valid at store s3 ** val s2_prefetch_train_valid = WireInit(false.B) - s2_prefetch_train_valid := s2_valid && io.dcache.resp.fire && !s2_out.mmio && !s2_in.tlbMiss && !s2_in.isHWPrefetch + s2_prefetch_train_valid := s2_valid && io.dcache.resp.fire && !s2_out.mmio && !s2_out.nc && !s2_in.tlbMiss && !s2_in.isHWPrefetch if(EnableStorePrefetchSMS) { io.s1_prefetch_spec := s1_fire io.s2_prefetch_spec := s2_prefetch_train_valid @@ -498,6 +501,7 @@ class StoreUnit(implicit p: Parameters) extends XSModule s3_out.uop := s3_in.uop s3_out.data := DontCare s3_out.debug.isMMIO := s3_in.mmio + s3_out.debug.isNC := s3_in.nc s3_out.debug.paddr := s3_in.paddr s3_out.debug.vaddr := s3_in.vaddr s3_out.debug.isPerfCnt := false.B @@ -520,6 +524,7 @@ class StoreUnit(implicit p: Parameters) extends XSModule sx_valid(i) := s3_valid sx_in(i).output := s3_out sx_in(i).vecFeedback := s3_vecFeedback + sx_in(i).nc := s3_in.nc sx_in(i).mmio := s3_in.mmio sx_in(i).usSecondInv := s3_in.usSecondInv sx_in(i).elemIdx := 
s3_in.elemIdx @@ -552,6 +557,7 @@ class StoreUnit(implicit p: Parameters) extends XSModule val sx_last_in_vec = sx_in_vec.takeRight(1).head sx_last_ready := !sx_last_valid || sx_last_in.output.uop.robIdx.needFlush(io.redirect) || io.stout.ready + // write back: normal store, nc store io.stout.valid := sx_last_valid && !sx_last_in.output.uop.robIdx.needFlush(io.redirect) && !sx_last_in_vec //isStore(sx_last_in.output.uop.fuType) io.stout.bits := sx_last_in.output io.stout.bits.uop.exceptionVec := ExceptionNO.selectByFu(sx_last_in.output.uop.exceptionVec, StaCfg) @@ -564,6 +570,7 @@ class StoreUnit(implicit p: Parameters) extends XSModule io.vecstout.bits.sourceType := RSFeedbackType.tlbMiss io.vecstout.bits.flushState := DontCare io.vecstout.bits.trigger := sx_last_in.output.uop.trigger + io.vecstout.bits.nc := sx_last_in.nc io.vecstout.bits.mmio := sx_last_in.mmio io.vecstout.bits.exceptionVec := ExceptionNO.selectByFu(sx_last_in.output.uop.exceptionVec, VstuCfg) io.vecstout.bits.usSecondInv := sx_last_in.usSecondInv diff --git a/src/main/scala/xiangshan/mem/sbuffer/Sbuffer.scala b/src/main/scala/xiangshan/mem/sbuffer/Sbuffer.scala index 002c6111a9..31dbabdf6b 100644 --- a/src/main/scala/xiangshan/mem/sbuffer/Sbuffer.scala +++ b/src/main/scala/xiangshan/mem/sbuffer/Sbuffer.scala @@ -303,7 +303,7 @@ class Sbuffer(implicit p: Parameters) // sbuffer_in_s1: // * read data and meta from fifo queue // * update sbuffer meta (vtag, ptag, flag) - // * prevert that line from being sent to dcache (add a block condition) + // * prevent that line from being sent to dcache (add a block condition) // * prepare cacheline level write enable signal, RegNext() data and mask // sbuffer_in_s2: diff --git a/src/main/scala/xiangshan/mem/vector/VecBundle.scala b/src/main/scala/xiangshan/mem/vector/VecBundle.scala index 3b24de1026..d491cbae55 100644 --- a/src/main/scala/xiangshan/mem/vector/VecBundle.scala +++ b/src/main/scala/xiangshan/mem/vector/VecBundle.scala @@ -111,6 +111,7 @@ class VecPipelineFeedbackIO(isVStore: Boolean=false) (implicit p: Parameters) ex val trigger = TriggerAction() //val dataInvalidSqIdx = new SqPtr //val paddr = UInt(PAddrBits.W) + val nc = Bool() val mmio = Bool() //val atomic = Bool() val exceptionVec = ExceptionVec() diff --git a/src/main/scala/xiangshan/mem/vector/VecCommon.scala b/src/main/scala/xiangshan/mem/vector/VecCommon.scala index 6477541710..e0dc3e3f7e 100644 --- a/src/main/scala/xiangshan/mem/vector/VecCommon.scala +++ b/src/main/scala/xiangshan/mem/vector/VecCommon.scala @@ -290,6 +290,7 @@ class VecFlowBundle(implicit p: Parameters) extends VLSUBundleWithMicroOp { class VecMemExuOutput(isVector: Boolean = false)(implicit p: Parameters) extends VLSUBundle{ val output = new MemExuOutput(isVector) val vecFeedback = Bool() + val nc = Bool() val mmio = Bool() val usSecondInv = Bool() val elemIdx = UInt(elemIdxBits.W) diff --git a/utility b/utility index 880e574d9f..80b00e017d 160000 --- a/utility +++ b/utility @@ -1 +1 @@ -Subproject commit 880e574d9fdc628d42651bc609962a0a30fe68bb +Subproject commit 80b00e017dd8dda4eaf491f5a5c21e35432efe8b
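Editor's aside (not part of the patch): taken together, both the load and store pipelines in this diff derive their NC/MMIO view from the Svpbmt page attribute first and only fall back to the PMP/PMA result for PMA pages. A self-contained plain-Scala sketch of that classification, assuming the standard Svpbmt encodings (0 = PMA, 1 = NC, 2 = IO, 3 reserved); names and types are illustrative, not the Pbmt helpers used in the code:

object PbmtClassifySketch {
  // Assumed standard Svpbmt encodings: PMA = 0, NC = 1, IO = 2.
  val PMA = 0; val NC = 1; val IO = 2

  // PBMT = NC marks the access non-cacheable; PBMT = IO makes it behave like MMIO;
  // only PBMT = PMA defers to the PMP/PMA mmio result, mirroring the s1/s2 logic above.
  def classify(pbmt: Int, pmpMmio: Boolean): (Boolean, Boolean) = {
    val nc   = pbmt == NC
    val mmio = if (pbmt == PMA) pmpMmio else pbmt == IO
    (nc, mmio)
  }

  def main(args: Array[String]): Unit = {
    assert(classify(NC, pmpMmio = false) == (true, false))  // Svpbmt NC page: non-cacheable
    assert(classify(IO, pmpMmio = false) == (false, true))  // Svpbmt IO page: treated as MMIO
    assert(classify(PMA, pmpMmio = true) == (false, true))  // PMA page: PMP/PMA decides
  }
}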