Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

area(DCache): combine tag and meta #3863

Draft
wants to merge 2 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 18 additions & 21 deletions src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala
Original file line number Diff line number Diff line change
Expand Up @@ -946,7 +946,6 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
//----------------------------------------
// core data structures
val bankedDataArray = if(dwpuParam.enWPU) Module(new SramedDataArray) else Module(new BankedDataArray)
val metaArray = Module(new L1CohMetaArray(readPorts = LoadPipelineWidth + 1, writePorts = 1))
val errorArray = Module(new L1FlagMetaArray(readPorts = LoadPipelineWidth + 1, writePorts = 1))
val prefetchArray = Module(new L1PrefetchSourceArray(readPorts = PrefetchArrayReadPort, writePorts = 1 + LoadPipelineWidth)) // prefetch flag array
val accessArray = Module(new L1FlagMetaArray(readPorts = AccessArrayReadPort, writePorts = LoadPipelineWidth + 1))
Expand Down Expand Up @@ -1033,19 +1032,17 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
// refillPipe.io.meta_write
)
if(StorePrefetchL1Enabled) {
meta_read_ports.zip(metaArray.io.read).foreach { case (p, r) => r <> p }
meta_resp_ports.zip(metaArray.io.resp).foreach { case (p, r) => p := r }
meta_resp_ports.zipWithIndex.foreach({ case (meta_resp, i) =>
meta_resp := tagArray.io.resp(i).meta })
} else {
(meta_read_ports.take(HybridLoadReadBase + 1) ++
meta_read_ports.takeRight(backendParams.HyuCnt)).zip(metaArray.io.read).foreach { case (p, r) => r <> p }
(meta_resp_ports.take(HybridLoadReadBase + 1) ++
meta_resp_ports.takeRight(backendParams.HyuCnt)).zip(metaArray.io.resp).foreach { case (p, r) => p := r }
meta_resp_ports.takeRight(backendParams.HyuCnt)).zipWithIndex.foreach({ case (meta_resp, i) =>
meta_resp := tagArray.io.resp(i).meta })

meta_read_ports.drop(HybridLoadReadBase + 1).take(HybridStoreReadBase).foreach { case p => p.ready := false.B }
meta_resp_ports.drop(HybridLoadReadBase + 1).take(HybridStoreReadBase).foreach { case p => p := 0.U.asTypeOf(p) }
}
meta_write_ports.zip(metaArray.io.write).foreach { case (p, w) => w <> p }

mainPipe.io.meta_write <> tagArray.io.meta_write
// read extra meta (exclude stu)
(meta_read_ports.take(HybridLoadReadBase + 1) ++
meta_read_ports.takeRight(backendParams.HyuCnt)).zip(errorArray.io.read).foreach { case (p, r) => r <> p }
Expand Down Expand Up @@ -1123,19 +1120,19 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
require(tagArray.io.read.size == (LoadPipelineWidth + 1))
}
// val tag_write_intend = missQueue.io.refill_pipe_req.valid || mainPipe.io.tag_write_intend
val tag_write_intend = mainPipe.io.tag_write_intend
assert(!RegNext(!tag_write_intend && tagArray.io.write.valid))
val tag_write_intend = mainPipe.io.tag_write_intend || mainPipe.io.meta_write.valid
assert(!RegNext(!tag_write_intend && tagArray.io.tag_write.valid))
ldu.take(HybridLoadReadBase).zipWithIndex.foreach {
case (ld, i) =>
tagArray.io.read(i) <> ld.io.tag_read
ld.io.tag_resp := tagArray.io.resp(i)
ld.io.tag_resp := tagArray.io.resp(i).tag
ld.io.tag_read.ready := !tag_write_intend
}
if(StorePrefetchL1Enabled) {
stu.take(HybridStoreReadBase).zipWithIndex.foreach {
case (st, i) =>
tagArray.io.read(HybridLoadReadBase + i) <> st.io.tag_read
st.io.tag_resp := tagArray.io.resp(HybridLoadReadBase + i)
st.io.tag_resp := tagArray.io.resp(HybridLoadReadBase + i).tag
st.io.tag_read.ready := !tag_write_intend
}
}else {
Expand Down Expand Up @@ -1172,23 +1169,23 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
}

// tag resp
ldu(HybridLoadTagReadPort).io.tag_resp := tagArray.io.resp(TagReadPort)
stu(HybridStoreTagReadPort).io.tag_resp := tagArray.io.resp(TagReadPort)
ldu(HybridLoadTagReadPort).io.tag_resp := tagArray.io.resp(TagReadPort).tag
stu(HybridStoreTagReadPort).io.tag_resp := tagArray.io.resp(TagReadPort).tag
}
tagArray.io.read.last <> mainPipe.io.tag_read
mainPipe.io.tag_resp := tagArray.io.resp.last
mainPipe.io.tag_resp := tagArray.io.resp.last.tag

val fake_tag_read_conflict_this_cycle = PopCount(ldu.map(ld=> ld.io.tag_read.valid))
XSPerfAccumulate("fake_tag_read_conflict", fake_tag_read_conflict_this_cycle)

val tag_write_arb = Module(new Arbiter(new TagWriteReq, 1))
// tag_write_arb.io.in(0) <> refillPipe.io.tag_write
tag_write_arb.io.in(0) <> mainPipe.io.tag_write
tagArray.io.write <> tag_write_arb.io.out
tagArray.io.tag_write <> tag_write_arb.io.out

ldu.map(m => {
m.io.vtag_update.valid := tagArray.io.write.valid
m.io.vtag_update.bits := tagArray.io.write.bits
m.io.vtag_update.valid := tagArray.io.tag_write.valid
m.io.vtag_update.bits := tagArray.io.tag_write.bits
})

//----------------------------------------
Expand Down Expand Up @@ -1253,9 +1250,9 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
dwpu.io.lookup_upd(i) <> ldu(i).io.dwpu.lookup_upd(0)
dwpu.io.cfpred(i) <> ldu(i).io.dwpu.cfpred(0)
}
dwpu.io.tagwrite_upd.valid := tagArray.io.write.valid
dwpu.io.tagwrite_upd.bits.vaddr := tagArray.io.write.bits.vaddr
dwpu.io.tagwrite_upd.bits.s1_real_way_en := tagArray.io.write.bits.way_en
dwpu.io.tagwrite_upd.valid := tagArray.io.tag_write.valid
dwpu.io.tagwrite_upd.bits.vaddr := tagArray.io.tag_write.bits.vaddr
dwpu.io.tagwrite_upd.bits.s1_real_way_en := tagArray.io.tag_write.bits.way_en
} else {
for(i <- 0 until LoadPipelineWidth){
ldu(i).io.dwpu.req(0).ready := true.B
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1422,9 +1422,10 @@ class MainPipe(implicit p: Parameters) extends DCacheModule with HasPerfEvents w
io.meta_read.bits.idx := get_idx(s0_req.vaddr)
io.meta_read.bits.way_en := Mux(s0_req.replace, s0_req.replace_way_en, ~0.U(nWays.W))

io.tag_read.valid := req.valid && !set_conflict && !s0_req.replace
io.tag_read.valid := req.valid && !set_conflict
io.tag_read.valid := req.valid && s1_ready && !set_conflict
io.tag_read.bits.idx := get_idx(s0_req.vaddr)
io.tag_read.bits.way_en := ~0.U(nWays.W)
io.tag_read.bits.way_en := Mux(s0_req.replace, s0_req.replace_way_en, ~0.U(nWays.W))

io.data_read_intend := s1_valid_dup(3) && s1_need_data
io.data_readline.valid := s1_valid_dup(4) && s1_need_data
Expand Down
85 changes: 60 additions & 25 deletions src/main/scala/xiangshan/cache/dcache/meta/TagArray.scala
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ import chisel3._
import chisel3.util._
import utility.{SRAMTemplate, XSPerfAccumulate, ClockGate}
import xiangshan.cache.CacheInstrucion._
import xiangshan.cache.Meta
import freechips.rocketchip._
import freechips.rocketchip.tilelink.ClientMetadata

class TagReadReq(implicit p: Parameters) extends DCacheBundle {
val idx = UInt(idxBits.W)
Expand All @@ -36,6 +39,11 @@ class TagEccWriteReq(implicit p: Parameters) extends TagReadReq {
val ecc = UInt(eccTagBits.W)
}

/** Read response of the combined tag/meta array: one tag and one `Meta`
  * entry per cache way, both indexed by way number.
  *
  * In `TagArray`, `tag` is taken from the low `tagBits` bits of each way's
  * SRAM read data and `meta` from the `metaBits` bits directly above them.
  *
  * NOTE(review): `DuplicatedTagArray` drives `tag` with `Cat(ecc, tag)`, which
  * is wider than `tagBits` (that port was previously `encTagBits` wide) —
  * confirm whether dropping the ECC bits on this path is intended.
  */
class TagMetaResp(implicit p: Parameters) extends DCacheBundle {
// Per-way tag value, `tagBits` wide.
val tag = Vec(nWays, UInt(tagBits.W))
// Per-way `Meta` entry read alongside the tag.
val meta = Vec(nWays, new Meta)
}

case object HasTagEccParam

abstract class AbstractTagArray(implicit p: Parameters) extends DCacheModule {
Expand All @@ -45,27 +53,38 @@ abstract class AbstractTagArray(implicit p: Parameters) extends DCacheModule {
class TagArray(implicit p: Parameters) extends AbstractTagArray {
val io = IO(new Bundle() {
val read = Flipped(DecoupledIO(new TagReadReq))
val resp = Output(Vec(nWays, UInt(tagBits.W)))
val write = Flipped(DecoupledIO(new TagWriteReq))
val resp = Output(new TagMetaResp)
val tag_write = Flipped(DecoupledIO(new TagWriteReq))
val meta_write = Flipped(DecoupledIO(new CohMetaWriteReq))
// ecc
val ecc_read = Flipped(DecoupledIO(new TagReadReq))
val ecc_resp = Output(Vec(nWays, UInt(eccTagBits.W)))
val ecc_write = Flipped(DecoupledIO(new TagEccWriteReq))
})
def metaBits = 2 //freechips.rocketchip.tilelink.ClientMetadata.width
// TODO: reset is unnecessary?
val rst_cnt = RegInit(0.U(log2Up(nSets + 1).W))
val rst = rst_cnt < nSets.U
val rstVal = 0.U
val waddr = Mux(rst, rst_cnt, io.write.bits.idx)
val wdata = Mux(rst, rstVal, io.write.bits.tag)
val wmask = Mux(rst || (nWays == 1).B, (-1).asSInt, io.write.bits.way_en.asSInt).asBools
val write_idx = WireInit(0.U)
val write_way_en = WireInit(0.U(nWays.W))
when(io.tag_write.valid) {
write_idx := io.tag_write.bits.idx
write_way_en := io.tag_write.bits.way_en
}.elsewhen(io.meta_write.valid) {
write_idx := io.meta_write.bits.idx
write_way_en := io.meta_write.bits.way_en
}
val waddr = Mux(rst, rst_cnt, write_idx)
val wdata = Mux(rst, rstVal, Cat(io.meta_write.bits.meta.coh.asUInt, io.tag_write.bits.tag))
val wmask = Mux(rst || (nWays == 1).B, (-1).asSInt, write_way_en.asSInt).asBools
val rmask = Mux(rst || (nWays == 1).B, (-1).asSInt, io.read.bits.way_en.asSInt).asBools
when (rst) {
rst_cnt := rst_cnt + 1.U
}

val tag_array = Module(new SRAMTemplate(UInt(tagBits.W), set = nSets, way = nWays,
shouldReset = false, holdRead = false, singlePort = true))
val tag_array = Module(new SRAMTemplate(UInt((metaBits + tagBits).W), set = nSets, way = nWays,
shouldReset = true, holdRead = false, singlePort = true, useBitmask = true))

val ecc_array = TagEccParam.map {
case _ =>
Expand All @@ -74,11 +93,20 @@ class TagArray(implicit p: Parameters) extends AbstractTagArray {
ecc
}

val wen = rst || io.write.valid
val wen = rst || io.tag_write.valid || io.meta_write.valid
val write_bitmask = WireInit(0.U((metaBits + tagBits).W))
when(io.meta_write.valid && io.tag_write.valid) {
write_bitmask := Fill(metaBits + tagBits, 1.U(1.W))
}.elsewhen(io.meta_write.valid && !io.tag_write.valid) {
write_bitmask := Cat(Fill(metaBits, 1.U(1.W)), Fill(tagBits, 0.U(1.W)))
}.elsewhen(!io.meta_write.valid && io.tag_write.valid) {
write_bitmask := Cat(Fill(metaBits, 0.U(1.W)), Fill(tagBits, 1.U(1.W)))
}
tag_array.io.w.req.valid := wen
tag_array.io.w.req.bits.apply(
setIdx = waddr,
data = wdata,
bitmask = write_bitmask,
waymask = VecInit(wmask).asUInt
)

Expand All @@ -103,7 +131,8 @@ class TagArray(implicit p: Parameters) extends AbstractTagArray {
tag_array.io.r.req.valid := ren
tag_array.io.r.req.bits.apply(setIdx = io.read.bits.idx)
tag_array.clock := ClockGate(false.B, ren | wen, clock)
io.resp := tag_array.io.r.resp.data
io.resp.tag := tag_array.io.r.resp.data.map(r => r(tagBits - 1, 0))
io.resp.meta := VecInit(tag_array.io.r.resp.data.map(r => r(metaBits + tagBits - 1, tagBits).asTypeOf(new Meta)))
XSPerfAccumulate("part_tag_read_counter", tag_array.io.r.req.valid)

val ecc_ren = io.ecc_read.fire
Expand All @@ -117,7 +146,8 @@ class TagArray(implicit p: Parameters) extends AbstractTagArray {
io.ecc_resp := 0.U.asTypeOf(io.ecc_resp)
}

io.write.ready := !rst
io.tag_write.ready := !rst
io.meta_write.ready := !rst
io.read.ready := !wen
ecc_array match {
case Some(ecc) =>
Expand All @@ -132,8 +162,9 @@ class TagArray(implicit p: Parameters) extends AbstractTagArray {
class DuplicatedTagArray(readPorts: Int)(implicit p: Parameters) extends AbstractTagArray {
val io = IO(new Bundle() {
val read = Vec(readPorts, Flipped(DecoupledIO(new TagReadReq)))
val resp = Output(Vec(readPorts, Vec(nWays, UInt(encTagBits.W))))
val write = Flipped(DecoupledIO(new TagWriteReq))
val resp = Output(Vec(readPorts, new TagMetaResp))
val tag_write = Flipped(DecoupledIO(new TagWriteReq))
val meta_write = Flipped(DecoupledIO(new CohMetaWriteReq))
// customized cache op port
val cacheOp = Flipped(new L1CacheInnerOpIO)
val cacheOp_req_dup = Vec(DCacheDupNum, Flipped(Valid(new CacheCtrlReqInfo)))
Expand All @@ -150,26 +181,30 @@ class DuplicatedTagArray(readPorts: Int)(implicit p: Parameters) extends Abstrac
val tag_read_oh = WireInit(VecInit(Seq.fill(readPorts)(0.U(XLEN.W))))
for (i <- 0 until readPorts) {
// normal read / write
array(i).io.write.valid := io.write.valid
array(i).io.write.bits := io.write.bits
array(i).io.ecc_write.valid := io.write.valid
array(i).io.ecc_write.bits.idx := io.write.bits.idx
array(i).io.ecc_write.bits.way_en := io.write.bits.way_en
val ecc = getECCFromEncTag(cacheParams.tagCode.encode(io.write.bits.tag))
array(i).io.tag_write.valid := io.tag_write.valid
array(i).io.tag_write.bits := io.tag_write.bits
array(i).io.meta_write.valid := io.meta_write.valid
array(i).io.meta_write.bits := io.meta_write.bits
array(i).io.ecc_write.valid := io.tag_write.valid
array(i).io.ecc_write.bits.idx := io.tag_write.bits.idx
array(i).io.ecc_write.bits.way_en := io.tag_write.bits.way_en
val ecc = getECCFromEncTag(cacheParams.tagCode.encode(io.tag_write.bits.tag))
array(i).io.ecc_write.bits.ecc := ecc

array(i).io.read <> io.read(i)
array(i).io.ecc_read.valid := io.read(i).valid
array(i).io.ecc_read.bits := io.read(i).bits
io.resp(i) := (array(i).io.ecc_resp zip array(i).io.resp).map { case (e, r) => Cat(e, r) }
io.resp(i).tag := (array(i).io.ecc_resp zip array(i).io.resp.tag).map { case (e, r) => Cat(e, r) }
io.resp(i).meta := array(i).io.resp.meta
// extra ports for cache op
// array(i).io.ecc_write.valid := false.B
// array(i).io.ecc_write.bits := DontCare
io.read(i).ready := array(i).io.read.ready && array(i).io.ecc_read.ready
tag_read_oh(i) := PopCount(array(i).io.read.fire)
}
XSPerfAccumulate("tag_read_counter", tag_read_oh.reduce(_ + _))
io.write.ready := true.B
io.tag_write.ready := true.B
io.meta_write.ready := true.B

require(nWays <= 32)
io.cacheOp.resp.bits := DontCare
Expand Down Expand Up @@ -198,10 +233,10 @@ class DuplicatedTagArray(readPorts: Int)(implicit p: Parameters) extends Abstrac
}
wdata_dup_vec.zipWithIndex.map{ case(dupIdx, idx) =>
when(io.cacheOp_req_dup(dupIdx).valid && isWriteTag(io.cacheOp_req_bits_opCode_dup(dupIdx))) {
array(idx).io.write.valid := true.B
array(idx).io.write.bits.idx := io.cacheOp.req.bits.index
array(idx).io.write.bits.way_en := UIntToOH(io.cacheOp.req.bits.wayNum(4, 0))
array(idx).io.write.bits.tag := io.cacheOp.req.bits.write_tag_low
array(idx).io.tag_write.valid := true.B
array(idx).io.tag_write.bits.idx := io.cacheOp.req.bits.index
array(idx).io.tag_write.bits.way_en := UIntToOH(io.cacheOp.req.bits.wayNum(4, 0))
array(idx).io.tag_write.bits.tag := io.cacheOp.req.bits.write_tag_low
cacheOpShouldResp := true.B
}
}
Expand All @@ -216,6 +251,6 @@ class DuplicatedTagArray(readPorts: Int)(implicit p: Parameters) extends Abstrac
}

io.cacheOp.resp.valid := RegNext(io.cacheOp.req.valid && cacheOpShouldResp)
io.cacheOp.resp.bits.read_tag_low := Mux(io.cacheOp.resp.valid, array(0).io.resp(RegNext(io.cacheOp.req.bits.wayNum)), 0.U)
io.cacheOp.resp.bits.read_tag_low := Mux(io.cacheOp.resp.valid, array(0).io.resp.tag(RegNext(io.cacheOp.req.bits.wayNum)), 0.U)
io.cacheOp.resp.bits.read_tag_ecc := Mux(io.cacheOp.resp.valid, array(0).io.ecc_resp(RegNext(io.cacheOp.req.bits.wayNum)), 0.U)
}
Loading