From 874d0c34a1920eb734bb5c8001822e614be10e87 Mon Sep 17 00:00:00 2001 From: lihuijin <501296508@qq.com> Date: Mon, 11 Nov 2024 17:34:35 +0800 Subject: [PATCH] area(DCache): combined tag and meta --- .../cache/dcache/DCacheWrapper.scala | 39 ++++----- .../cache/dcache/mainpipe/MainPipe.scala | 5 +- .../cache/dcache/meta/TagArray.scala | 84 +++++++++++++------ 3 files changed, 78 insertions(+), 50 deletions(-) diff --git a/src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala b/src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala index 8f827dfd48..aeaf78fa08 100644 --- a/src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala +++ b/src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala @@ -946,11 +946,10 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame //---------------------------------------- // core data structures val bankedDataArray = if(dwpuParam.enWPU) Module(new SramedDataArray) else Module(new BankedDataArray) - val metaArray = Module(new L1CohMetaArray(readPorts = LoadPipelineWidth + 1, writePorts = 1)) val errorArray = Module(new L1FlagMetaArray(readPorts = LoadPipelineWidth + 1, writePorts = 1)) val prefetchArray = Module(new L1PrefetchSourceArray(readPorts = PrefetchArrayReadPort, writePorts = 1 + LoadPipelineWidth)) // prefetch flag array val accessArray = Module(new L1FlagMetaArray(readPorts = AccessArrayReadPort, writePorts = LoadPipelineWidth + 1)) - val tagArray = Module(new DuplicatedTagArray(readPorts = TagReadPort)) + val tagArray = Module(new DuplicatedTagArray(readPorts = TagReadPort, writePorts=1)) val prefetcherMonitor = Module(new PrefetcherMonitor) val fdpMonitor = Module(new FDPrefetcherMonitor) val bloomFilter = Module(new BloomFilter(BLOOM_FILTER_ENTRY_NUM, true)) @@ -1033,19 +1032,15 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame // refillPipe.io.meta_write ) if(StorePrefetchL1Enabled) { - meta_read_ports.zip(metaArray.io.read).foreach { case (p, r) 
=> r <> p } - meta_resp_ports.zip(metaArray.io.resp).foreach { case (p, r) => p := r } + meta_resp_ports.zip(tagArray.io.meta_resp).foreach { case (p, r) => p := r } } else { - (meta_read_ports.take(HybridLoadReadBase + 1) ++ - meta_read_ports.takeRight(backendParams.HyuCnt)).zip(metaArray.io.read).foreach { case (p, r) => r <> p } (meta_resp_ports.take(HybridLoadReadBase + 1) ++ - meta_resp_ports.takeRight(backendParams.HyuCnt)).zip(metaArray.io.resp).foreach { case (p, r) => p := r } + meta_resp_ports.takeRight(backendParams.HyuCnt)).zip(tagArray.io.meta_resp).foreach { case (p, r) => p := r } meta_read_ports.drop(HybridLoadReadBase + 1).take(HybridStoreReadBase).foreach { case p => p.ready := false.B } meta_resp_ports.drop(HybridLoadReadBase + 1).take(HybridStoreReadBase).foreach { case p => p := 0.U.asTypeOf(p) } } - meta_write_ports.zip(metaArray.io.write).foreach { case (p, w) => w <> p } - + meta_write_ports.zip(tagArray.io.meta_write).foreach { case (p, w) => w <> p } // read extra meta (exclude stu) (meta_read_ports.take(HybridLoadReadBase + 1) ++ meta_read_ports.takeRight(backendParams.HyuCnt)).zip(errorArray.io.read).foreach { case (p, r) => r <> p } @@ -1123,19 +1118,19 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame require(tagArray.io.read.size == (LoadPipelineWidth + 1)) } // val tag_write_intend = missQueue.io.refill_pipe_req.valid || mainPipe.io.tag_write_intend - val tag_write_intend = mainPipe.io.tag_write_intend - assert(!RegNext(!tag_write_intend && tagArray.io.write.valid)) + val tag_write_intend = mainPipe.io.tag_write_intend || mainPipe.io.meta_write.valid + assert(!RegNext(!tag_write_intend && tagArray.io.tag_write.valid)) ldu.take(HybridLoadReadBase).zipWithIndex.foreach { case (ld, i) => tagArray.io.read(i) <> ld.io.tag_read - ld.io.tag_resp := tagArray.io.resp(i) + ld.io.tag_resp := tagArray.io.tag_resp(i) ld.io.tag_read.ready := !tag_write_intend } if(StorePrefetchL1Enabled) { 
stu.take(HybridStoreReadBase).zipWithIndex.foreach { case (st, i) => tagArray.io.read(HybridLoadReadBase + i) <> st.io.tag_read - st.io.tag_resp := tagArray.io.resp(HybridLoadReadBase + i) + st.io.tag_resp := tagArray.io.tag_resp(HybridLoadReadBase + i) st.io.tag_read.ready := !tag_write_intend } }else { @@ -1172,11 +1167,11 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame } // tag resp - ldu(HybridLoadTagReadPort).io.tag_resp := tagArray.io.resp(TagReadPort) - stu(HybridStoreTagReadPort).io.tag_resp := tagArray.io.resp(TagReadPort) + ldu(HybridLoadTagReadPort).io.tag_resp := tagArray.io.tag_resp(TagReadPort) + stu(HybridStoreTagReadPort).io.tag_resp := tagArray.io.tag_resp(TagReadPort) } tagArray.io.read.last <> mainPipe.io.tag_read - mainPipe.io.tag_resp := tagArray.io.resp.last + mainPipe.io.tag_resp := tagArray.io.tag_resp.last val fake_tag_read_conflict_this_cycle = PopCount(ldu.map(ld=> ld.io.tag_read.valid)) XSPerfAccumulate("fake_tag_read_conflict", fake_tag_read_conflict_this_cycle) @@ -1184,11 +1179,11 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame val tag_write_arb = Module(new Arbiter(new TagWriteReq, 1)) // tag_write_arb.io.in(0) <> refillPipe.io.tag_write tag_write_arb.io.in(0) <> mainPipe.io.tag_write - tagArray.io.write <> tag_write_arb.io.out + tagArray.io.tag_write <> tag_write_arb.io.out ldu.map(m => { - m.io.vtag_update.valid := tagArray.io.write.valid - m.io.vtag_update.bits := tagArray.io.write.bits + m.io.vtag_update.valid := tagArray.io.tag_write.valid + m.io.vtag_update.bits := tagArray.io.tag_write.bits }) //---------------------------------------- @@ -1253,9 +1248,9 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame dwpu.io.lookup_upd(i) <> ldu(i).io.dwpu.lookup_upd(0) dwpu.io.cfpred(i) <> ldu(i).io.dwpu.cfpred(0) } - dwpu.io.tagwrite_upd.valid := tagArray.io.write.valid - dwpu.io.tagwrite_upd.bits.vaddr := tagArray.io.write.bits.vaddr 
- dwpu.io.tagwrite_upd.bits.s1_real_way_en := tagArray.io.write.bits.way_en + dwpu.io.tagwrite_upd.valid := tagArray.io.tag_write.valid + dwpu.io.tagwrite_upd.bits.vaddr := tagArray.io.tag_write.bits.vaddr + dwpu.io.tagwrite_upd.bits.s1_real_way_en := tagArray.io.tag_write.bits.way_en } else { for(i <- 0 until LoadPipelineWidth){ ldu(i).io.dwpu.req(0).ready := true.B diff --git a/src/main/scala/xiangshan/cache/dcache/mainpipe/MainPipe.scala b/src/main/scala/xiangshan/cache/dcache/mainpipe/MainPipe.scala index f9b96f138c..ebcf6089a5 100644 --- a/src/main/scala/xiangshan/cache/dcache/mainpipe/MainPipe.scala +++ b/src/main/scala/xiangshan/cache/dcache/mainpipe/MainPipe.scala @@ -1422,9 +1422,9 @@ class MainPipe(implicit p: Parameters) extends DCacheModule with HasPerfEvents w io.meta_read.bits.idx := get_idx(s0_req.vaddr) io.meta_read.bits.way_en := Mux(s0_req.replace, s0_req.replace_way_en, ~0.U(nWays.W)) - io.tag_read.valid := req.valid && !set_conflict && !s0_req.replace + io.tag_read.valid := req.valid && s1_ready && !set_conflict io.tag_read.bits.idx := get_idx(s0_req.vaddr) - io.tag_read.bits.way_en := ~0.U(nWays.W) + io.tag_read.bits.way_en := Mux(s0_req.replace, s0_req.replace_way_en, ~0.U(nWays.W)) io.data_read_intend := s1_valid_dup(3) && s1_need_data io.data_readline.valid := s1_valid_dup(4) && s1_need_data diff --git a/src/main/scala/xiangshan/cache/dcache/meta/TagArray.scala b/src/main/scala/xiangshan/cache/dcache/meta/TagArray.scala index 1853c0f61b..ff13b4cc51 100644 --- a/src/main/scala/xiangshan/cache/dcache/meta/TagArray.scala +++ b/src/main/scala/xiangshan/cache/dcache/meta/TagArray.scala @@ -21,6 +21,9 @@ import chisel3._ import chisel3.util._ import utility.{SRAMTemplate, XSPerfAccumulate, ClockGate} import xiangshan.cache.CacheInstrucion._ +import xiangshan.cache.Meta +import freechips.rocketchip._ +import freechips.rocketchip.tilelink.ClientMetadata class TagReadReq(implicit p: Parameters) 
extends DCacheBundle { val idx = UInt(idxBits.W) @@ -45,27 +48,39 @@ abstract class AbstractTagArray(implicit p: Parameters) extends DCacheModule { class TagArray(implicit p: Parameters) extends AbstractTagArray { val io = IO(new Bundle() { val read = Flipped(DecoupledIO(new TagReadReq)) - val resp = Output(Vec(nWays, UInt(tagBits.W))) - val write = Flipped(DecoupledIO(new TagWriteReq)) + val tag_resp = Output(Vec(nWays, UInt(tagBits.W))) + val meta_resp = Output(Vec(nWays, new Meta)) + val tag_write = Flipped(DecoupledIO(new TagWriteReq)) + val meta_write = Flipped(DecoupledIO(new CohMetaWriteReq)) // ecc val ecc_read = Flipped(DecoupledIO(new TagReadReq)) val ecc_resp = Output(Vec(nWays, UInt(eccTagBits.W))) val ecc_write = Flipped(DecoupledIO(new TagEccWriteReq)) }) + def metaBits = 2 //freechips.rocketchip.tilelink.ClientMetadata.width // TODO: reset is unnecessary? val rst_cnt = RegInit(0.U(log2Up(nSets + 1).W)) val rst = rst_cnt < nSets.U val rstVal = 0.U - val waddr = Mux(rst, rst_cnt, io.write.bits.idx) - val wdata = Mux(rst, rstVal, io.write.bits.tag) - val wmask = Mux(rst || (nWays == 1).B, (-1).asSInt, io.write.bits.way_en.asSInt).asBools + val write_idx = WireInit(0.U) + val write_way_en = WireInit(0.U(nWays.W)) + when(io.tag_write.valid) { + write_idx := io.tag_write.bits.idx + write_way_en := io.tag_write.bits.way_en + }.elsewhen(io.meta_write.valid) { + write_idx := io.meta_write.bits.idx + write_way_en := io.meta_write.bits.way_en + } + val waddr = Mux(rst, rst_cnt, write_idx) + val wdata = Mux(rst, rstVal, Cat(io.meta_write.bits.meta.coh.asUInt, io.tag_write.bits.tag)) + val wmask = Mux(rst || (nWays == 1).B, (-1).asSInt, write_way_en.asSInt).asBools val rmask = Mux(rst || (nWays == 1).B, (-1).asSInt, io.read.bits.way_en.asSInt).asBools when (rst) { rst_cnt := rst_cnt + 1.U } - val tag_array = Module(new SRAMTemplate(UInt(tagBits.W), set = nSets, way = nWays, - shouldReset = false, holdRead = false, singlePort = true)) + val tag_array = Module(new 
SRAMTemplate(UInt((metaBits + tagBits).W), set = nSets, way = nWays, + shouldReset = true, holdRead = false, singlePort = true, useBitmask = true)) val ecc_array = TagEccParam.map { case _ => @@ -74,11 +89,20 @@ class TagArray(implicit p: Parameters) extends AbstractTagArray { ecc } - val wen = rst || io.write.valid + val wen = rst || io.tag_write.valid || io.meta_write.valid + val write_bitmask = WireInit(0.U((metaBits + tagBits).W)) + when(io.meta_write.valid && io.tag_write.valid) { + write_bitmask := Fill(metaBits + tagBits, 1.U(1.W)) + }.elsewhen(io.meta_write.valid && !io.tag_write.valid) { + write_bitmask := Cat(Fill(metaBits, 1.U(1.W)), Fill(tagBits, 0.U(1.W))) + }.elsewhen(!io.meta_write.valid && io.tag_write.valid) { + write_bitmask := Cat(Fill(metaBits, 0.U(1.W)), Fill(tagBits, 1.U(1.W))) + } tag_array.io.w.req.valid := wen tag_array.io.w.req.bits.apply( setIdx = waddr, data = wdata, + bitmask = write_bitmask, waymask = VecInit(wmask).asUInt ) @@ -103,7 +127,8 @@ class TagArray(implicit p: Parameters) extends AbstractTagArray { tag_array.io.r.req.valid := ren tag_array.io.r.req.bits.apply(setIdx = io.read.bits.idx) tag_array.clock := ClockGate(false.B, ren | wen, clock) - io.resp := tag_array.io.r.resp.data + io.tag_resp := tag_array.io.r.resp.data.map(r => r(tagBits - 1, 0)) + io.meta_resp := VecInit(tag_array.io.r.resp.data.map(r => r(metaBits + tagBits - 1, tagBits).asTypeOf(new Meta))) XSPerfAccumulate("part_tag_read_counter", tag_array.io.r.req.valid) val ecc_ren = io.ecc_read.fire @@ -117,7 +142,8 @@ class TagArray(implicit p: Parameters) extends AbstractTagArray { io.ecc_resp := 0.U.asTypeOf(io.ecc_resp) } - io.write.ready := !rst + io.tag_write.ready := !rst + io.meta_write.ready := !rst io.read.ready := !wen ecc_array match { case Some(ecc) => @@ -129,11 +155,13 @@ class TagArray(implicit p: Parameters) extends AbstractTagArray { } } -class DuplicatedTagArray(readPorts: Int)(implicit p: Parameters) extends AbstractTagArray { +class 
DuplicatedTagArray(readPorts: Int, writePorts: Int)(implicit p: Parameters) extends AbstractTagArray { val io = IO(new Bundle() { val read = Vec(readPorts, Flipped(DecoupledIO(new TagReadReq))) - val resp = Output(Vec(readPorts, Vec(nWays, UInt(encTagBits.W)))) - val write = Flipped(DecoupledIO(new TagWriteReq)) + val tag_resp = Output(Vec(readPorts, Vec(nWays, UInt(encTagBits.W)))) + val meta_resp = Output(Vec(readPorts, Vec(nWays, new Meta))) + val tag_write = Flipped(DecoupledIO(new TagWriteReq)) + val meta_write = Vec(writePorts, Flipped(DecoupledIO(new CohMetaWriteReq))) // customized cache op port val cacheOp = Flipped(new L1CacheInnerOpIO) val cacheOp_req_dup = Vec(DCacheDupNum, Flipped(Valid(new CacheCtrlReqInfo))) @@ -150,18 +178,21 @@ class DuplicatedTagArray(readPorts: Int)(implicit p: Parameters) extends Abstrac val tag_read_oh = WireInit(VecInit(Seq.fill(readPorts)(0.U(XLEN.W)))) for (i <- 0 until readPorts) { // normal read / write - array(i).io.write.valid := io.write.valid - array(i).io.write.bits := io.write.bits - array(i).io.ecc_write.valid := io.write.valid - array(i).io.ecc_write.bits.idx := io.write.bits.idx - array(i).io.ecc_write.bits.way_en := io.write.bits.way_en - val ecc = getECCFromEncTag(cacheParams.tagCode.encode(io.write.bits.tag)) + array(i).io.tag_write.valid := io.tag_write.valid + array(i).io.tag_write.bits := io.tag_write.bits + array(i).io.meta_write.valid := io.meta_write(0).valid + array(i).io.meta_write.bits := io.meta_write(0).bits + array(i).io.ecc_write.valid := io.tag_write.valid + array(i).io.ecc_write.bits.idx := io.tag_write.bits.idx + array(i).io.ecc_write.bits.way_en := io.tag_write.bits.way_en + val ecc = getECCFromEncTag(cacheParams.tagCode.encode(io.tag_write.bits.tag)) array(i).io.ecc_write.bits.ecc := ecc array(i).io.read <> io.read(i) array(i).io.ecc_read.valid := io.read(i).valid array(i).io.ecc_read.bits := io.read(i).bits - io.resp(i) := (array(i).io.ecc_resp zip array(i).io.resp).map { case (e, r) => 
Cat(e, r) } + io.tag_resp(i) := (array(i).io.ecc_resp zip array(i).io.tag_resp).map { case (e, r) => Cat(e, r) } + io.meta_resp(i) := array(i).io.meta_resp // extra ports for cache op // array(i).io.ecc_write.valid := false.B // array(i).io.ecc_write.bits := DontCare @@ -169,7 +200,8 @@ class DuplicatedTagArray(readPorts: Int)(implicit p: Parameters) extends Abstrac tag_read_oh(i) := PopCount(array(i).io.read.fire) } XSPerfAccumulate("tag_read_counter", tag_read_oh.reduce(_ + _)) - io.write.ready := true.B + io.tag_write.ready := true.B + io.meta_write(0).ready := true.B require(nWays <= 32) io.cacheOp.resp.bits := DontCare @@ -198,10 +230,10 @@ class DuplicatedTagArray(readPorts: Int)(implicit p: Parameters) extends Abstrac } wdata_dup_vec.zipWithIndex.map{ case(dupIdx, idx) => when(io.cacheOp_req_dup(dupIdx).valid && isWriteTag(io.cacheOp_req_bits_opCode_dup(dupIdx))) { - array(idx).io.write.valid := true.B - array(idx).io.write.bits.idx := io.cacheOp.req.bits.index - array(idx).io.write.bits.way_en := UIntToOH(io.cacheOp.req.bits.wayNum(4, 0)) - array(idx).io.write.bits.tag := io.cacheOp.req.bits.write_tag_low + array(idx).io.tag_write.valid := true.B + array(idx).io.tag_write.bits.idx := io.cacheOp.req.bits.index + array(idx).io.tag_write.bits.way_en := UIntToOH(io.cacheOp.req.bits.wayNum(4, 0)) + array(idx).io.tag_write.bits.tag := io.cacheOp.req.bits.write_tag_low cacheOpShouldResp := true.B } } @@ -216,6 +248,6 @@ class DuplicatedTagArray(readPorts: Int)(implicit p: Parameters) extends Abstrac } io.cacheOp.resp.valid := RegNext(io.cacheOp.req.valid && cacheOpShouldResp) - io.cacheOp.resp.bits.read_tag_low := Mux(io.cacheOp.resp.valid, array(0).io.resp(RegNext(io.cacheOp.req.bits.wayNum)), 0.U) + io.cacheOp.resp.bits.read_tag_low := Mux(io.cacheOp.resp.valid, array(0).io.tag_resp(RegNext(io.cacheOp.req.bits.wayNum)), 0.U) io.cacheOp.resp.bits.read_tag_ecc := Mux(io.cacheOp.resp.valid, array(0).io.ecc_resp(RegNext(io.cacheOp.req.bits.wayNum)), 0.U) }