diff --git a/src/main/scala/rocket/DCache.scala b/src/main/scala/rocket/DCache.scala index dca16a84..f8cf1388 100644 --- a/src/main/scala/rocket/DCache.scala +++ b/src/main/scala/rocket/DCache.scala @@ -828,6 +828,11 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { io.cpu.perf.release := edge.done(tl_out_c) io.cpu.perf.grant := d_done io.cpu.perf.tlbMiss := io.ptw.req.fire() + io.cpu.perf.blocked := { + // stop reporting blocked just before unblocking to avoid overly conservative stalling + val cycles = outer.bufferUncachedRequests.map(n => if (n > 1) 1 else 2).getOrElse(2) + cached_grant_wait && d_address_inc < ((cacheBlockBytes - cycles * beatBytes) max 0) + } // report errors val (data_error, data_error_uncorrectable, data_error_addr) = diff --git a/src/main/scala/rocket/HellaCache.scala b/src/main/scala/rocket/HellaCache.scala index ba539151..06adce0e 100644 --- a/src/main/scala/rocket/HellaCache.scala +++ b/src/main/scala/rocket/HellaCache.scala @@ -137,6 +137,7 @@ class HellaCachePerfEvents extends Bundle { val release = Bool() val grant = Bool() val tlbMiss = Bool() + val blocked = Bool() } // interface between D$ and processor/DTLB diff --git a/src/main/scala/rocket/HellaCacheArbiter.scala b/src/main/scala/rocket/HellaCacheArbiter.scala index 49d759f9..4d9997e5 100644 --- a/src/main/scala/rocket/HellaCacheArbiter.scala +++ b/src/main/scala/rocket/HellaCacheArbiter.scala @@ -64,6 +64,7 @@ class HellaCacheArbiter(n: Int)(implicit p: Parameters) extends Module io.requestor(i).perf := io.mem.perf io.requestor(i).s2_nack := io.mem.s2_nack && s2_id === UInt(i) io.requestor(i).s2_nack_cause_raw := io.mem.s2_nack_cause_raw + io.requestor(i).clock_enabled := io.mem.clock_enabled resp.bits := io.mem.resp.bits resp.bits.tag := io.mem.resp.bits.tag >> log2Up(n) diff --git a/src/main/scala/rocket/RocketCore.scala b/src/main/scala/rocket/RocketCore.scala index 3db0b901..59715cc1 100644 --- a/src/main/scala/rocket/RocketCore.scala +++ b/src/main/scala/rocket/RocketCore.scala @@ -87,6 +87,7 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p) with HasCoreIO { val clock_en_reg = RegInit(true.B) + val long_latency_stall = Reg(Bool()) val imem_might_request_reg = Reg(Bool()) val clock_en = Wire(init=true.B) val gated_clock = @@ -671,8 +672,12 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p) checkHazards(fp_hazard_targets, fp_sboard.read _) } else Bool(false) - val dcache_blocked = Reg(Bool()) - dcache_blocked := !io.dmem.req.ready && io.dmem.clock_enabled && (io.dmem.req.valid || dcache_blocked) + val dcache_blocked = { + // speculate that a blocked D$ will unblock the cycle after a Grant + val blocked = Reg(Bool()) + blocked := !io.dmem.req.ready && io.dmem.clock_enabled && !io.dmem.perf.grant && (blocked || io.dmem.req.valid || io.dmem.s2_nack) + blocked && !io.dmem.perf.grant + } val rocc_blocked = Reg(Bool()) rocc_blocked := !wb_xcpt && !io.rocc.cmd.ready && (io.rocc.cmd.valid || rocc_blocked) @@ -762,14 +767,15 @@ class Rocket(implicit p: Parameters) extends CoreModule()(p) // gate the clock if (rocketParams.clockGate) { - clock_en := clock_en_reg || (!csr.io.csr_stall && io.imem.resp.valid) + long_latency_stall := csr.io.csr_stall || io.dmem.perf.blocked + clock_en := clock_en_reg || (!long_latency_stall && io.imem.resp.valid) clock_en_reg := ex_pc_valid || mem_pc_valid || wb_pc_valid || // instruction in flight io.ptw.customCSRs.disableCoreClockGate || // chicken bit !div.io.req.ready || // mul/div in flight usingFPU && !io.fpu.fcsr_rdy || // long-latency FPU in flight io.dmem.replay_next || // long-latency load replaying - (!csr.io.csr_stall && (ibuf.io.inst(0).valid || io.imem.resp.valid)) // instruction pending + (!long_latency_stall && (ibuf.io.inst(0).valid || io.imem.resp.valid)) // instruction pending } // evaluate performance counters