diff --git a/deploy/runtools/run_farm_deploy_managers.py b/deploy/runtools/run_farm_deploy_managers.py index f5f3ed27..ce2a7320 100644 --- a/deploy/runtools/run_farm_deploy_managers.py +++ b/deploy/runtools/run_farm_deploy_managers.py @@ -1099,7 +1099,7 @@ class RHSResearchNitefuryIIInstanceDeployManager(XilinxAlveoInstanceDeployManage collect = run('lspci | grep -i xilinx') bdfs = [ i[:7] for i in collect.splitlines() if len(i.strip()) >= 0 ] bdf = bdfs[slotno].replace('.', ':').split(':') - extra_args = f"+domain=0x0000 +bus=0x{bdf[0]} +device=0x{bdf[1]} +function=0x0 +bar=0x0 +pci-vendor=0x10ee +pci-device=0x7011" + extra_args = f"+domain=0x0000 +bus=0x{bdf[0]} +device=0x{bdf[1]} +function=0x0 +bar=0x0 +pci-vendor=0x10ee +pci-device=0x903f" else: extra_args = None diff --git a/platforms/rhsresearch_nitefury_ii/NiteFury-and-LiteFury-firesim b/platforms/rhsresearch_nitefury_ii/NiteFury-and-LiteFury-firesim index 24a56a67..f1dcf86f 160000 --- a/platforms/rhsresearch_nitefury_ii/NiteFury-and-LiteFury-firesim +++ b/platforms/rhsresearch_nitefury_ii/NiteFury-and-LiteFury-firesim @@ -1 +1 @@ -Subproject commit 24a56a672afc7cb5d79d04a139c9ecfee61fe32d +Subproject commit f1dcf86f46db0bbf27432190f5e694061ae12a81 diff --git a/sim/midas/src/main/cc/bridges/cpu_managed_stream.cc b/sim/midas/src/main/cc/bridges/cpu_managed_stream.cc index 768c9d35..d581f7e9 100644 --- a/sim/midas/src/main/cc/bridges/cpu_managed_stream.cc +++ b/sim/midas/src/main/cc/bridges/cpu_managed_stream.cc @@ -24,10 +24,10 @@ size_t CPUManagedStreams::CPUToFPGADriver::push(void *src, // implement non-multiples of 512b. The FPGA-side queue will take on the // high-order bytes of the final beat in the transaction, and the strobe is // not respected. So put the assertion here and discuss what to do next. 
- assert((num_bytes % beat_bytes()) == 0); + assert((num_bytes % fpga_buffer_width_bytes()) == 0); - auto num_beats = num_bytes / beat_bytes(); - auto threshold_beats = required_bytes / beat_bytes(); + auto num_beats = num_bytes / fpga_buffer_width_bytes(); + auto threshold_beats = required_bytes / fpga_buffer_width_bytes(); assert(threshold_beats <= fpga_buffer_size()); auto space_available = fpga_buffer_size() - mmio_read(count_addr()); @@ -37,7 +37,7 @@ size_t CPUManagedStreams::CPUToFPGADriver::push(void *src, } auto push_beats = std::min(space_available, num_beats); - auto push_bytes = push_beats * beat_bytes(); + auto push_bytes = push_beats * fpga_buffer_width_bytes(); auto bytes_written = cpu_managed_axi4_write(dma_addr(), (char *)src, push_bytes); assert(bytes_written == push_bytes); @@ -70,10 +70,10 @@ size_t CPUManagedStreams::FPGAToCPUDriver::pull(void *dest, // Due to the destructive nature of reads, if we wish to support reads that // aren't a multiple of 512b, we'll need to keep a little buffer around for // the remainder, and prepend this to the destination buffer. 
- assert((num_bytes % beat_bytes()) == 0); + assert((num_bytes % fpga_buffer_width_bytes()) == 0); - auto num_beats = num_bytes / beat_bytes(); - auto threshold_beats = required_bytes / beat_bytes(); + auto num_beats = num_bytes / fpga_buffer_width_bytes(); + auto threshold_beats = required_bytes / fpga_buffer_width_bytes(); assert(threshold_beats <= fpga_buffer_size()); auto count = mmio_read(count_addr()); @@ -83,7 +83,7 @@ size_t CPUManagedStreams::FPGAToCPUDriver::pull(void *dest, } auto pull_beats = std::min(count, num_beats); - auto pull_bytes = pull_beats * beat_bytes(); + auto pull_bytes = pull_beats * fpga_buffer_width_bytes(); auto bytes_read = cpu_managed_axi4_read(dma_addr(), (char *)dest, pull_bytes); assert(bytes_read == pull_bytes); return bytes_read; diff --git a/sim/midas/src/main/cc/bridges/cpu_managed_stream.h b/sim/midas/src/main/cc/bridges/cpu_managed_stream.h index dd30b106..27cf7e34 100644 --- a/sim/midas/src/main/cc/bridges/cpu_managed_stream.h +++ b/sim/midas/src/main/cc/bridges/cpu_managed_stream.h @@ -57,13 +57,16 @@ struct StreamParameters { uint64_t dma_addr; uint64_t count_addr; uint32_t fpga_buffer_size; + uint32_t fpga_buffer_width_bytes; StreamParameters(const std::string &stream_name, uint64_t dma_addr, uint64_t count_addr, - int fpga_buffer_size) + int fpga_buffer_size, + int fpga_buffer_width_bytes) : stream_name(stream_name), dma_addr(dma_addr), count_addr(count_addr), - fpga_buffer_size(fpga_buffer_size) {} + fpga_buffer_size(fpga_buffer_size), + fpga_buffer_width_bytes(fpga_buffer_width_bytes) {} }; /** @@ -106,7 +109,7 @@ public: int fpga_buffer_size() { return params.fpga_buffer_size; }; uint64_t dma_addr() { return params.dma_addr; }; uint64_t count_addr() { return params.count_addr; }; - uint64_t beat_bytes() const { return io.get_beat_bytes(); } + uint64_t fpga_buffer_width_bytes() const { return params.fpga_buffer_width_bytes; } }; /** diff --git a/sim/midas/src/main/scala/midas/Config.scala 
b/sim/midas/src/main/scala/midas/Config.scala index 96f34941..3011c9bd 100644 --- a/sim/midas/src/main/scala/midas/Config.scala +++ b/sim/midas/src/main/scala/midas/Config.scala @@ -128,7 +128,7 @@ class NitefuryConfig extends Config(new Config((site, here, up) => { case StreamEngineInstantiatorKey => (e: StreamEngineParameters, p: Parameters) => new CPUManagedStreamEngine(p, e) case CPUManagedAXI4Key => Some(CPUManagedAXI4Params( addrBits = 64, - dataBits = 512, + dataBits = 128, idBits = 4, )) case FPGAManagedAXI4Key => None diff --git a/sim/midas/src/main/scala/midas/core/CPUManagedStreamEngine.scala b/sim/midas/src/main/scala/midas/core/CPUManagedStreamEngine.scala index f8f352eb..dd9b7f5d 100644 --- a/sim/midas/src/main/scala/midas/core/CPUManagedStreamEngine.scala +++ b/sim/midas/src/main/scala/midas/core/CPUManagedStreamEngine.scala @@ -14,6 +14,54 @@ import freechips.rocketchip.util.DecoupledHelper import midas.targetutils.xdc import midas.widgets._ +class StreamAdapterIO(val w: Int) extends Bundle { /* One w-bit Decoupled input (flipped) paired with one w-bit Decoupled output. */ + val in = Flipped(Decoupled(UInt(w.W))) + val out = Decoupled(UInt(w.W)) + + def flipConnect(other: StreamAdapterIO): Unit = { + in <> other.out + other.in <> out + } +} + +class StreamWidthAdapter(narrowW: Int, wideW: Int) extends Module { /* Packs wideW/narrowW narrow beats into one wide beat (first beat in the least-significant bits) and unpacks wide beats in the same order; wideW must be a multiple of narrowW. */ + require(wideW >= narrowW) + require(wideW % narrowW == 0) + val io = IO(new Bundle { + val narrow = new StreamAdapterIO(narrowW) + val wide = new StreamAdapterIO(wideW) + }) + + if (wideW == narrowW) { + io.narrow.out <> io.wide.in + io.wide.out <> io.narrow.in + } else { + val beats = wideW / narrowW + + val narrow_beats = RegInit(0.U(log2Ceil(beats).W)) + val narrow_last_beat = narrow_beats === (beats-1).U + val narrow_data = Reg(Vec(beats-1, UInt(narrowW.W))) + + val wide_beats = RegInit(0.U(log2Ceil(beats).W)) + val wide_last_beat = wide_beats === (beats-1).U + + io.narrow.in.ready := Mux(narrow_last_beat, io.wide.out.ready, true.B) + when (io.narrow.in.fire()) { + narrow_beats := Mux(narrow_last_beat, 0.U, narrow_beats +
1.U) + when (!narrow_last_beat) { narrow_data(narrow_beats) := io.narrow.in.bits } + } + io.wide.out.valid := narrow_last_beat && io.narrow.in.valid + io.wide.out.bits := Cat(io.narrow.in.bits, narrow_data.asUInt) + + io.narrow.out.valid := io.wide.in.valid + io.narrow.out.bits := io.wide.in.bits.asTypeOf(Vec(beats, UInt(narrowW.W)))(wide_beats) /* select narrowW-bit slice number wide_beats, least-significant slice first, mirroring the Cat() packing order above */ + when (io.narrow.out.fire()) { + wide_beats := Mux(wide_last_beat, 0.U, wide_beats + 1.U) + } + io.wide.in.ready := wide_last_beat && io.narrow.out.ready + } +} + /** * A helper container to serialize per-stream constants to the header. This is * currently somewhat redundant with the default header emission for widgets. @@ -22,15 +70,17 @@ case class StreamDriverParameters( name: String, bufferBaseAddress: Int, countMMIOAddress: Int, - bufferCapacity: Int) + bufferCapacity: Int, + bufferWidthBytes: Int) class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters) extends StreamEngine(p) { val cpuManagedAXI4params = p(CPUManagedAXI4Key).get - require(BridgeStreamConstants.streamWidthBits == cpuManagedAXI4params.dataBits, - s"CPU-managed AXI4 IF data width must match the stream width: ${BridgeStreamConstants.streamWidthBits}.") + require(BridgeStreamConstants.streamWidthBits >= cpuManagedAXI4params.dataBits, + s"CPU-managed AXI4 IF data width (${cpuManagedAXI4params.dataBits}) must be less than or equal to the stream width (${BridgeStreamConstants.streamWidthBits}).") - val beatBytes = cpuManagedAXI4params.dataBits / 8 + val axiBeatBytes = cpuManagedAXI4params.dataBits / 8 + val bufferWidthBytes = BridgeStreamConstants.streamWidthBits / 8 val cpuManagedAXI4NodeOpt = Some(AXI4SlaveNode( Seq(AXI4SlavePortParameters( @@ -39,10 +89,10 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters) resources = (new MemoryDevice).reg, regionType = RegionType.UNCACHED, // cacheable executable = false, - supportsWrite = TransferSizes(beatBytes, 4096), - supportsRead = TransferSizes(beatBytes, 4096), + supportsWrite =
TransferSizes(axiBeatBytes, 4096), + supportsRead = TransferSizes(axiBeatBytes, 4096), interleavedId = Some(0))), // slave does not interleave read responses - beatBytes = beatBytes) + beatBytes = axiBeatBytes) )) ) @@ -56,8 +106,8 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters) // FromHostCPU streams are implemented using the AW, W, B channels, which // write into large BRAM FIFOs for each stream. - assert(!axi4.aw.valid || axi4.aw.bits.size === log2Ceil(beatBytes).U) - assert(!axi4.w.valid || axi4.w.bits.strb === ~0.U(beatBytes.W)) + assert(!axi4.aw.valid || axi4.aw.bits.size === log2Ceil(axiBeatBytes).U) + assert(!axi4.w.valid || axi4.w.bits.strb === ~0.U(axiBeatBytes.W)) axi4.b.bits.resp := 0.U(2.W) axi4.b.bits.id := axi4.aw.bits.id @@ -67,7 +117,6 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters) axi4.aw.ready := false.B axi4.w.ready := false.B - // TODO: Chisel naming prefix to indicate what channel this hw belongs to. // This demultiplexes the AW, W, and B channels onto the decoupled ports representing each stream. 
def elaborateFromHostCPUStream( @@ -76,6 +125,12 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters) idx: Int, addressSpaceBits: Int): StreamDriverParameters = prefix(chParams.name) { + val ser_des = Module(new StreamWidthAdapter(cpuManagedAXI4params.dataBits, BridgeStreamConstants.streamWidthBits)) + // unused + ser_des.io.wide.in.bits := 0.U + ser_des.io.wide.in.valid := false.B + ser_des.io.narrow.out.ready := false.B + val streamName = chParams.name val grant = (axi4.aw.bits.addr >> addressSpaceBits) === idx.U @@ -88,11 +143,15 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters) val countAddr = attach(incomingQueue.io.count, s"${chParams.name}_count", ReadOnly, substruct = false) + incomingQueue.io.enq.bits := ser_des.io.wide.out.bits + incomingQueue.io.enq.valid := ser_des.io.wide.out.valid + ser_des.io.wide.out.ready := incomingQueue.io.enq.ready + val writeHelper = DecoupledHelper( axi4.aw.valid, axi4.w.valid, axi4.b.ready, - incomingQueue.io.enq.ready + ser_des.io.narrow.in.ready ) // TODO: Get rid of this magic number. 
@@ -108,18 +167,19 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters) axi4.b.valid := writeHelper.fire(axi4.b.ready, lastWriteBeat) } - incomingQueue.io.enq.valid := grant && writeHelper.fire(incomingQueue.io.enq.ready) - incomingQueue.io.enq.bits := axi4.w.bits.data + ser_des.io.narrow.in.valid := grant && writeHelper.fire(ser_des.io.narrow.in.ready) + ser_des.io.narrow.in.bits := axi4.w.bits.data StreamDriverParameters( chParams.name, idx * (1 << addressSpaceBits), countAddr, - chParams.fpgaBufferDepth + chParams.fpgaBufferDepth, + chParams.fpgaBufferWidthBytes ) } - assert(!axi4.ar.valid || axi4.ar.bits.size === log2Ceil(beatBytes).U) + assert(!axi4.ar.valid || axi4.ar.bits.size === log2Ceil(axiBeatBytes).U) axi4.r.bits.resp := 0.U(2.W) axi4.r.bits.id := axi4.ar.bits.id @@ -134,6 +194,13 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters) idx: Int, addressSpaceBits: Int): StreamDriverParameters = prefix(chParams.name) { + + val ser_des = Module(new StreamWidthAdapter(cpuManagedAXI4params.dataBits, BridgeStreamConstants.streamWidthBits)) + // unused + ser_des.io.narrow.in.bits := 0.U + ser_des.io.narrow.in.valid := false.B + ser_des.io.wide.out.ready := false.B + val grant = (axi4.ar.bits.addr >> addressSpaceBits) === idx.U val outgoingQueue = Module(new BRAMQueue(chParams.fpgaBufferDepth)(UInt(BridgeStreamConstants.streamWidthBits.W))) @@ -141,6 +208,10 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters) outgoingQueue.io.enq <> channel + ser_des.io.wide.in.bits := outgoingQueue.io.deq.bits + ser_des.io.wide.in.valid := outgoingQueue.io.deq.valid + outgoingQueue.io.deq.ready := ser_des.io.wide.in.ready + // check to see if axi4 has valid output instead of waiting for timeouts val countAddr = attach(outgoingQueue.io.count, s"${chParams.name}_count", ReadOnly, substruct = false) @@ -148,7 +219,7 @@ class CPUManagedStreamEngine(p: Parameters, val params: 
StreamEngineParameters) val readHelper = DecoupledHelper( axi4.ar.valid, axi4.r.ready, - outgoingQueue.io.deq.valid + ser_des.io.narrow.out.valid ) val readBeatCounter = RegInit(0.U(9.W)) @@ -157,11 +228,11 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters) readBeatCounter := Mux(lastReadBeat, 0.U, readBeatCounter + 1.U) } - outgoingQueue.io.deq.ready := grant && readHelper.fire(outgoingQueue.io.deq.valid) + ser_des.io.narrow.out.ready := grant && readHelper.fire(ser_des.io.narrow.out.valid) when (grant) { axi4.r.valid := readHelper.fire(axi4.r.ready) - axi4.r.bits.data := outgoingQueue.io.deq.bits + axi4.r.bits.data := ser_des.io.narrow.out.bits axi4.r.bits.last := lastReadBeat axi4.ar.ready := readHelper.fire(axi4.ar.valid, lastReadBeat) } @@ -169,7 +240,8 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters) chParams.name, idx * (1 << addressSpaceBits), countAddr, - chParams.fpgaBufferDepth) + chParams.fpgaBufferDepth, + chParams.fpgaBufferWidthBytes) } def implementStreams[A <: StreamParameters]( @@ -187,7 +259,7 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters) // fractured into multiple, smaller AXI4 transactions (<= 4K in size), it // is simplest to maintain the illusion that each stream is granted an // address range at least as large as the largest DMA access. 
- def streamASBits = log2Ceil(beatBytes * streamParameters.map(_.fpgaBufferDepth).max) + def streamASBits = log2Ceil(bufferWidthBytes * streamParameters.map(_.fpgaBufferDepth).max) for (((port, params), idx) <- streamPorts.zip(streamParameters).zipWithIndex) yield { elaborator(port, params, idx, streamASBits) @@ -206,7 +278,8 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters) | std::string(${CStrLit(p.name).toC}), | ${UInt64(p.bufferBaseAddress).toC}, | ${UInt64(base + p.countMMIOAddress).toC}, - | ${UInt32(p.bufferCapacity).toC} + | ${UInt32(p.bufferCapacity).toC}, + | ${UInt32(p.bufferWidthBytes).toC} |)""".stripMargin))) } diff --git a/sim/midas/src/main/scala/midas/core/StreamEngine.scala b/sim/midas/src/main/scala/midas/core/StreamEngine.scala index 58c4864d..20480535 100644 --- a/sim/midas/src/main/scala/midas/core/StreamEngine.scala +++ b/sim/midas/src/main/scala/midas/core/StreamEngine.scala @@ -26,15 +26,16 @@ trait StreamParameters { def name: String def idx: Int def fpgaBufferDepth: Int + def fpgaBufferWidthBytes: Int /** * Pretty prints a description of this stream. */ def summaryString: String = - s"${name}, FPGA Buffer Depth: ${fpgaBufferDepth} Beats" + s"Name: ${name}, Idx: ${idx}, FPGA Buffer Depth: ${fpgaBufferDepth}, FPGA Buffer Width: ${fpgaBufferWidthBytes}" } -case class StreamSourceParameters(name: String, idx: Int, fpgaBufferDepth: Int) extends StreamParameters -case class StreamSinkParameters (name: String, idx: Int, fpgaBufferDepth: Int) extends StreamParameters +case class StreamSourceParameters(name: String, idx: Int, fpgaBufferDepth: Int, fpgaBufferWidthBytes: Int) extends StreamParameters +case class StreamSinkParameters (name: String, idx: Int, fpgaBufferDepth: Int, fpgaBufferWidthBytes: Int) extends StreamParameters /** * A wrapper class for common arguments to all StreamEngine implementations. 
diff --git a/sim/midas/src/main/scala/midas/widgets/UsesBridgeStreams.scala b/sim/midas/src/main/scala/midas/widgets/UsesBridgeStreams.scala index 660696ef..e0f698a1 100644 --- a/sim/midas/src/main/scala/midas/widgets/UsesBridgeStreams.scala +++ b/sim/midas/src/main/scala/midas/widgets/UsesBridgeStreams.scala @@ -58,7 +58,8 @@ trait StreamFromHostCPU { self: Widget => final def streamSinkParams = StreamSinkParameters( fromHostStreamName, fromHostStreamIdx, - fromHostCPUQueueDepth) + fromHostCPUQueueDepth, + BridgeStreamConstants.streamWidthBits/8) private val _streamDeq = InModuleBody { @@ -83,7 +84,8 @@ trait StreamToHostCPU { self: Widget => final def streamSourceParams = StreamSourceParameters( toHostStreamName, toHostStreamIdx, - toHostCPUQueueDepth) + toHostCPUQueueDepth, + BridgeStreamConstants.streamWidthBits/8) private val _streamEnq = InModuleBody { val streamToHostCPU = IO(BridgeStreamConstants.streamChiselType)