From dc4fac130426dbec49b49d778b9105d79b4a8eab Mon Sep 17 00:00:00 2001 From: CharlieLiu <67408162+bosscharlie@users.noreply.github.com> Date: Mon, 2 Dec 2024 14:13:38 +0800 Subject: [PATCH] feat(DCache): merge CMO requests into DCache TL-A Channel (#3968) * remove previous cmo datapath in memblock. * add datapath for cmo requests between lsq and dcache. * add new CMOUnit in MissQueue. * bump rocket-chip & coupledL2. --- coupledL2 | 2 +- rocket-chip | 2 +- src/main/scala/top/Configs.scala | 2 - src/main/scala/xiangshan/XSTile.scala | 12 --- .../scala/xiangshan/backend/MemBlock.scala | 20 +---- .../cache/dcache/DCacheWrapper.scala | 15 +++- .../cache/dcache/mainpipe/MissQueue.scala | 74 ++++++++++++++++++- .../xiangshan/mem/lsqueue/LSQWrapper.scala | 2 +- .../xiangshan/mem/lsqueue/StoreQueue.scala | 2 +- 9 files changed, 94 insertions(+), 37 deletions(-) diff --git a/coupledL2 b/coupledL2 index d66cd85aca..c4ce81f4c9 160000 --- a/coupledL2 +++ b/coupledL2 @@ -1 +1 @@ -Subproject commit d66cd85aca7164d35d409c230e2f48b857067687 +Subproject commit c4ce81f4c932ea9a238b3e6df0079a007c2ba1ba diff --git a/rocket-chip b/rocket-chip index 564f53e623..bb4baf85c5 160000 --- a/rocket-chip +++ b/rocket-chip @@ -1 +1 @@ -Subproject commit 564f53e623c7b0872472aab6d40c5c59fc719cf1 +Subproject commit bb4baf85c5bd4b55ffdcda12a75648fef212ab69 diff --git a/src/main/scala/top/Configs.scala b/src/main/scala/top/Configs.scala index 4fd63c70f1..a80e8a0db7 100644 --- a/src/main/scala/top/Configs.scala +++ b/src/main/scala/top/Configs.scala @@ -211,7 +211,6 @@ class MinimalConfig(n: Int = 1) extends Config( "dcache", isKeywordBitsOpt = p.dcacheParametersOpt.get.isKeywordBitsOpt )), - hasCMO = p.HasCMO && site(EnableCHI), )), L2NBanks = 2, prefetcher = None // if L2 pf_recv_node does not exist, disable SMS prefetcher @@ -299,7 +298,6 @@ class WithNKBL2 prefetch = Seq(BOPParameters()) ++ (if (tp) Seq(TPParameters()) else Nil) ++ (if (p.prefetcher.nonEmpty) Seq(PrefetchReceiverParams()) else Nil), - hasCMO = p.HasCMO && site(EnableCHI), enablePerf = !site(DebugOptionsKey).FPGAPlatform && site(DebugOptionsKey).EnablePerfDebug, enableRollingDB = site(DebugOptionsKey).EnableRollingDB, enableMonitor = site(DebugOptionsKey).AlwaysBasicDB, diff --git a/src/main/scala/xiangshan/XSTile.scala b/src/main/scala/xiangshan/XSTile.scala index 07776dd595..2b198db80e 100644 --- a/src/main/scala/xiangshan/XSTile.scala +++ b/src/main/scala/xiangshan/XSTile.scala @@ -79,18 +79,6 @@ class XSTile()(implicit p: Parameters) extends LazyModule case None => } - // CMO - l2top.inner.l2cache match { - case Some(l2) => - l2.cmo_sink_node.foreach(recv => { - recv := memBlock.cmo_sender.get - }) - l2.cmo_source_node.foreach(resp => { - memBlock.cmo_reciver.get := resp - }) - case None => - } - val core_l3_tpmeta_source_port = l2top.inner.l2cache match { case Some(l2) => l2.tpmeta_source_node case None => None diff --git a/src/main/scala/xiangshan/backend/MemBlock.scala b/src/main/scala/xiangshan/backend/MemBlock.scala index 1bbe0bd756..0dde7f92c8 100644 --- a/src/main/scala/xiangshan/backend/MemBlock.scala +++ b/src/main/scala/xiangshan/backend/MemBlock.scala @@ -24,7 +24,7 @@ import freechips.rocketchip.diplomacy.{BundleBridgeSource, LazyModule, LazyModul import freechips.rocketchip.interrupts.{IntSinkNode, IntSinkPortSimple} import freechips.rocketchip.tile.HasFPUParameters import freechips.rocketchip.tilelink._ -import coupledL2.{PrefetchRecv, CMOReq, CMOResp} +import coupledL2.{PrefetchRecv} import device.MsiInfoBundle import utils._ import utility._ @@ -248,8 +248,6 @@ class MemBlockInlined()(implicit p: Parameters) extends LazyModule val l3_pf_sender_opt = if (p(SoCParamsKey).L3CacheParamsOpt.nonEmpty) coreParams.prefetcher.map(_ => BundleBridgeSource(() => new huancun.PrefetchRecv) ) else None - val cmo_sender = if (HasCMO) Some(BundleBridgeSource(() => DecoupledIO(new CMOReq))) else None - val cmo_reciver = if (HasCMO) Some(BundleBridgeSink(Some(() => DecoupledIO(new CMOResp)))) else None val frontendBridge = LazyModule(new FrontendBridge) // interrupt sinks val clint_int_sink = IntSinkNode(IntSinkPortSimple(1, 2)) @@ -1099,20 +1097,8 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer) lsq.io.maControl <> storeMisalignBuffer.io.sqControl - // lsq to l2 CMO - outer.cmo_sender match { - case Some(x) => - x.out.head._1 <> lsq.io.cmoOpReq - case None => - lsq.io.cmoOpReq.ready := false.B - } - outer.cmo_reciver match { - case Some(x) => - x.in.head._1 <> lsq.io.cmoOpResp - case None => - lsq.io.cmoOpResp.valid := false.B - lsq.io.cmoOpResp.bits := 0.U.asTypeOf(new CMOResp) - } + lsq.io.cmoOpReq <> dcache.io.cmoOpReq + lsq.io.cmoOpResp <> dcache.io.cmoOpResp // Prefetcher val StreamDTLBPortIndex = TlbStartVec(dtlb_ld_idx) + LduCnt + HyuCnt diff --git a/src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala b/src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala index e821287a28..671cac29af 100644 --- a/src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala +++ b/src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala @@ -579,6 +579,15 @@ class AtomicWordIO(implicit p: Parameters) extends DCacheBundle val block_lr = Input(Bool()) } +class CMOReq(implicit p: Parameters) extends Bundle { + val opcode = UInt(3.W) // 0-cbo.clean, 1-cbo.flush, 2-cbo.inval, 3-cbo.zero + val address = UInt(64.W) +} + +class CMOResp(implicit p: Parameters) extends Bundle { + val address = UInt(64.W) +} + // used by load unit class DCacheLoadIO(implicit p: Parameters) extends DCacheWordIO { @@ -786,6 +795,8 @@ class DCacheIO(implicit p: Parameters) extends DCacheBundle { val debugTopDown = new DCacheTopDownIO val debugRolling = Flipped(new RobDebugRollingIO) val l2_hint = Input(Valid(new L2ToL1Hint())) + val cmoOpReq = Flipped(DecoupledIO(new CMOReq)) + val cmoOpResp = DecoupledIO(new CMOResp) } private object ArbiterCtrl { @@ -1425,6 +1436,8 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame missReqArb.io.out <> missQueue.io.req missReadyGen.io.queryMQ <> missQueue.io.queryMQ + io.cmoOpReq <> missQueue.io.cmo_req + io.cmoOpResp <> missQueue.io.cmo_resp for (w <- 0 until LoadPipelineWidth) { ldu(w).io.mq_enq_cancel := missQueue.io.mq_enq_cancel } @@ -1514,7 +1527,7 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame // in L1DCache, we ony expect Grant[Data] and ReleaseAck bus.d.ready := false.B - when (bus.d.bits.opcode === TLMessages.Grant || bus.d.bits.opcode === TLMessages.GrantData) { + when (bus.d.bits.opcode === TLMessages.Grant || bus.d.bits.opcode === TLMessages.GrantData || bus.d.bits.opcode === TLMessages.CBOAck) { missQueue.io.mem_grant <> bus.d } .elsewhen (bus.d.bits.opcode === TLMessages.ReleaseAck) { wb.io.mem_grant <> bus.d diff --git a/src/main/scala/xiangshan/cache/dcache/mainpipe/MissQueue.scala b/src/main/scala/xiangshan/cache/dcache/mainpipe/MissQueue.scala index 8dbbc67408..81c13bdf7c 100644 --- a/src/main/scala/xiangshan/cache/dcache/mainpipe/MissQueue.scala +++ b/src/main/scala/xiangshan/cache/dcache/mainpipe/MissQueue.scala @@ -31,6 +31,7 @@ import difftest._ import freechips.rocketchip.tilelink.ClientStates._ import freechips.rocketchip.tilelink.MemoryOpCategories._ import freechips.rocketchip.tilelink.TLPermissions._ +import freechips.rocketchip.tilelink.TLMessages._ import freechips.rocketchip.tilelink._ import huancun.{AliasKey, DirtyKey, PrefetchKey} import org.chipsalliance.cde.config.Parameters @@ -285,6 +286,63 @@ class MissReqPipeRegBundle(edge: TLEdgeOut)(implicit p: Parameters) extends DCac } } +class CMOUnit(edge: TLEdgeOut)(implicit p: Parameters) extends DCacheModule { + val io = IO(new Bundle() { + val req = Flipped(DecoupledIO(new CMOReq)) + val req_chanA = DecoupledIO(new TLBundleA(edge.bundle)) + val resp_chanD = Flipped(DecoupledIO(new TLBundleD(edge.bundle))) + val resp_to_lsq = DecoupledIO(new CMOResp) + }) + + val s_idle :: s_sreq :: s_wresp :: s_lsq_resp :: Nil = Enum(4) + val state = RegInit(s_idle) + val state_next = WireInit(state) + val req = RegEnable(io.req.bits, io.req.fire) + + state := state_next + + switch (state) { + is(s_idle) { + when (io.req.fire) { + state_next := s_sreq + } + } + is(s_sreq) { + when (io.req_chanA.fire) { + state_next := s_wresp + } + } + is(s_wresp) { + when (io.resp_chanD.fire) { + state_next := s_lsq_resp + } + } + is(s_lsq_resp) { + when (io.resp_to_lsq.fire) { + state_next := s_idle + } + } + } + + io.req.ready := state === s_idle + + io.req_chanA.valid := state === s_sreq + io.req_chanA.bits := edge.CacheBlockOperation( + fromSource = (cfg.nMissEntries + 1).U, + toAddress = req.address, + lgSize = (log2Up(cfg.blockBytes)).U, + opcode = req.opcode + )._2 + + io.resp_chanD.ready := state === s_wresp + + io.resp_to_lsq.valid := state === s_lsq_resp + io.resp_to_lsq.bits.address := req.address + + assert(!(state =/= s_idle && io.req.valid)) + assert(!(state =/= s_wresp && io.resp_chanD.valid)) +} + class MissEntry(edge: TLEdgeOut, reqNum: Int)(implicit p: Parameters) extends DCacheModule with HasCircularQueuePtrHelper { @@ -844,6 +902,10 @@ class MissQueue(edge: TLEdgeOut, reqNum: Int)(implicit p: Parameters) extends DC val resp = Output(new MissResp) val refill_to_ldq = ValidIO(new Refill) + // cmo req + val cmo_req = Flipped(DecoupledIO(new CMOReq)) + val cmo_resp = DecoupledIO(new CMOResp) + val queryMQ = Vec(reqNum, Flipped(new DCacheMQQueryIOBundle)) val mem_acquire = DecoupledIO(new TLBundleA(edge.bundle)) @@ -898,6 +960,7 @@ class MissQueue(edge: TLEdgeOut, reqNum: Int)(implicit p: Parameters) extends DC // 128KBL1: FIXME: provide vaddr for l2 val entries = Seq.fill(cfg.nMissEntries)(Module(new MissEntry(edge, reqNum))) + val cmo_unit = Module(new CMOUnit(edge)) val miss_req_pipe_reg = RegInit(0.U.asTypeOf(new MissReqPipeRegBundle(edge))) val acquire_from_pipereg = Wire(chiselTypeOf(io.mem_acquire)) @@ -1055,6 +1118,15 @@ class MissQueue(edge: TLEdgeOut, reqNum: Int)(implicit p: Parameters) extends DC } } + cmo_unit.io.req <> io.cmo_req + io.cmo_resp <> cmo_unit.io.resp_to_lsq + when (io.mem_grant.valid && io.mem_grant.bits.opcode === TLMessages.CBOAck) { + cmo_unit.io.resp_chanD <> io.mem_grant + } .otherwise { + cmo_unit.io.resp_chanD.valid := false.B + cmo_unit.io.resp_chanD.bits := DontCare + } + io.req.ready := accept io.mq_enq_cancel := io.req.bits.cancel io.refill_to_ldq.valid := Cat(entries.map(_.io.refill_to_ldq.valid)).orR @@ -1069,7 +1141,7 @@ class MissQueue(edge: TLEdgeOut, reqNum: Int)(implicit p: Parameters) extends DC XSPerfAccumulate("acquire_fire_from_pipereg", acquire_from_pipereg.fire) XSPerfAccumulate("pipereg_valid", miss_req_pipe_reg.reg_valid()) - val acquire_sources = Seq(acquire_from_pipereg) ++ entries.map(_.io.mem_acquire) + val acquire_sources = Seq(cmo_unit.io.req_chanA, acquire_from_pipereg) ++ entries.map(_.io.mem_acquire) TLArbiter.lowest(edge, io.mem_acquire, acquire_sources:_*) TLArbiter.lowest(edge, io.mem_finish, entries.map(_.io.mem_finish):_*) diff --git a/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala b/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala index d8bd550304..b9f59b29d5 100644 --- a/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/LSQWrapper.scala @@ -25,11 +25,11 @@ import xiangshan._ import xiangshan.backend.Bundles.{DynInst, MemExuOutput} import xiangshan.cache._ import xiangshan.cache.{DCacheWordIO, DCacheLineIO, MemoryOpConstants} +import xiangshan.cache.{CMOReq, CMOResp} import xiangshan.cache.mmu.{TlbRequestIO, TlbHintIO} import xiangshan.mem._ import xiangshan.backend._ import xiangshan.backend.rob.RobLsqIO -import coupledL2.{CMOReq, CMOResp} import xiangshan.backend.fu.FuType class ExceptionAddrIO(implicit p: Parameters) extends XSBundle { diff --git a/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala b/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala index 59988259ec..f5437650de 100644 --- a/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala +++ b/src/main/scala/xiangshan/mem/lsqueue/StoreQueue.scala @@ -27,6 +27,7 @@ import utils._ import xiangshan._ import xiangshan.cache._ import xiangshan.cache.{DCacheLineIO, DCacheWordIO, MemoryOpConstants} +import xiangshan.cache.{CMOReq, CMOResp} import xiangshan.backend._ import xiangshan.backend.rob.{RobLsqIO, RobPtr} import xiangshan.backend.Bundles.{DynInst, MemExuOutput} @@ -34,7 +35,6 @@ import xiangshan.backend.decode.isa.bitfield.{Riscv32BitInst, XSInstBitFields} import xiangshan.backend.fu.FuConfig._ import xiangshan.backend.fu.FuType import xiangshan.ExceptionNO._ -import coupledL2.{CMOReq, CMOResp} class SqPtr(implicit p: Parameters) extends CircularQueuePtr[SqPtr]( p => p(XSCoreParamsKey).StoreQueueSize