decouple queue interface width exposed to streaming bridges from AXI4 DMA width

This commit is contained in:
Sagar Karandikar 2023-05-30 09:22:21 -07:00
parent 77891f3266
commit b3dc0b0eb1
8 changed files with 119 additions and 40 deletions

View File

@ -1099,7 +1099,7 @@ class RHSResearchNitefuryIIInstanceDeployManager(XilinxAlveoInstanceDeployManage
collect = run('lspci | grep -i xilinx')
bdfs = [ i[:7] for i in collect.splitlines() if len(i.strip()) >= 0 ]
bdf = bdfs[slotno].replace('.', ':').split(':')
extra_args = f"+domain=0x0000 +bus=0x{bdf[0]} +device=0x{bdf[1]} +function=0x0 +bar=0x0 +pci-vendor=0x10ee +pci-device=0x7011"
extra_args = f"+domain=0x0000 +bus=0x{bdf[0]} +device=0x{bdf[1]} +function=0x0 +bar=0x0 +pci-vendor=0x10ee +pci-device=0x903f"
else:
extra_args = None

@ -1 +1 @@
Subproject commit 24a56a672afc7cb5d79d04a139c9ecfee61fe32d
Subproject commit f1dcf86f46db0bbf27432190f5e694061ae12a81

View File

@ -24,10 +24,10 @@ size_t CPUManagedStreams::CPUToFPGADriver::push(void *src,
// implement non-multiples of 512b. The FPGA-side queue will take on the
// high-order bytes of the final beat in the transaction, and the strobe is
// not respected. So put the assertion here and discuss what to do next.
assert((num_bytes % beat_bytes()) == 0);
assert((num_bytes % fpga_buffer_width_bytes()) == 0);
auto num_beats = num_bytes / beat_bytes();
auto threshold_beats = required_bytes / beat_bytes();
auto num_beats = num_bytes / fpga_buffer_width_bytes();
auto threshold_beats = required_bytes / fpga_buffer_width_bytes();
assert(threshold_beats <= fpga_buffer_size());
auto space_available = fpga_buffer_size() - mmio_read(count_addr());
@ -37,7 +37,7 @@ size_t CPUManagedStreams::CPUToFPGADriver::push(void *src,
}
auto push_beats = std::min(space_available, num_beats);
auto push_bytes = push_beats * beat_bytes();
auto push_bytes = push_beats * fpga_buffer_width_bytes();
auto bytes_written =
cpu_managed_axi4_write(dma_addr(), (char *)src, push_bytes);
assert(bytes_written == push_bytes);
@ -70,10 +70,10 @@ size_t CPUManagedStreams::FPGAToCPUDriver::pull(void *dest,
// Due to the destructive nature of reads, if we wish to support reads that
// aren't a multiple of 512b, we'll need to keep a little buffer around for
// the remainder, and prepend this to the destination buffer.
assert((num_bytes % beat_bytes()) == 0);
assert((num_bytes % fpga_buffer_width_bytes()) == 0);
auto num_beats = num_bytes / beat_bytes();
auto threshold_beats = required_bytes / beat_bytes();
auto num_beats = num_bytes / fpga_buffer_width_bytes();
auto threshold_beats = required_bytes / fpga_buffer_width_bytes();
assert(threshold_beats <= fpga_buffer_size());
auto count = mmio_read(count_addr());
@ -83,7 +83,7 @@ size_t CPUManagedStreams::FPGAToCPUDriver::pull(void *dest,
}
auto pull_beats = std::min(count, num_beats);
auto pull_bytes = pull_beats * beat_bytes();
auto pull_bytes = pull_beats * fpga_buffer_width_bytes();
auto bytes_read = cpu_managed_axi4_read(dma_addr(), (char *)dest, pull_bytes);
assert(bytes_read == pull_bytes);
return bytes_read;

View File

@ -57,13 +57,16 @@ struct StreamParameters {
uint64_t dma_addr;
uint64_t count_addr;
uint32_t fpga_buffer_size;
uint32_t fpga_buffer_width_bytes;
StreamParameters(const std::string &stream_name,
uint64_t dma_addr,
uint64_t count_addr,
int fpga_buffer_size)
int fpga_buffer_size,
int fpga_buffer_width_bytes)
: stream_name(stream_name), dma_addr(dma_addr), count_addr(count_addr),
fpga_buffer_size(fpga_buffer_size) {}
fpga_buffer_size(fpga_buffer_size),
fpga_buffer_width_bytes(fpga_buffer_width_bytes) {}
};
/**
@ -106,7 +109,7 @@ public:
int fpga_buffer_size() { return params.fpga_buffer_size; };
uint64_t dma_addr() { return params.dma_addr; };
uint64_t count_addr() { return params.count_addr; };
uint64_t beat_bytes() const { return io.get_beat_bytes(); }
uint64_t fpga_buffer_width_bytes() const { return params.fpga_buffer_width_bytes; }
};
/**

View File

@ -128,7 +128,7 @@ class NitefuryConfig extends Config(new Config((site, here, up) => {
case StreamEngineInstantiatorKey => (e: StreamEngineParameters, p: Parameters) => new CPUManagedStreamEngine(p, e)
case CPUManagedAXI4Key => Some(CPUManagedAXI4Params(
addrBits = 64,
dataBits = 512,
dataBits = 128,
idBits = 4,
))
case FPGAManagedAXI4Key => None

View File

@ -14,6 +14,54 @@ import freechips.rocketchip.util.DecoupledHelper
import midas.targetutils.xdc
import midas.widgets._
/**
 * One side of a [[StreamWidthAdapter]]: a pair of decoupled channels carrying
 * `w`-bit words, one in each direction.
 *
 * @param w width, in bits, of the data carried on both channels
 */
class StreamAdapterIO(val w: Int) extends Bundle {
  /** Words entering the module that owns this bundle. */
  val in = Flipped(Decoupled(UInt(w.W)))
  /** Words leaving the module that owns this bundle. */
  val out = Decoupled(UInt(w.W))

  /**
   * Cross-connects this bundle with `other`: `other.out` drives this `in`,
   * and this `out` drives `other.in`.
   *
   * @param other the peer interface to wire against
   */
  def flipConnect(other: StreamAdapterIO): Unit = {
    in <> other.out
    other.in <> out
  }
}
/**
 * Bidirectional width converter between a narrow and a wide decoupled stream.
 *
 *  - narrow.in -> wide.out: `wideW / narrowW` consecutive narrow beats are
 *    packed into a single wide word. The first narrow beat lands in the
 *    least-significant bits.
 *  - wide.in -> narrow.out: each wide word is emitted as `wideW / narrowW`
 *    narrow beats, least-significant slice first (the same ordering as the
 *    packing direction).
 *
 * When both widths are equal the module is a pure pass-through.
 *
 * @param narrowW width in bits of the narrow interface
 * @param wideW   width in bits of the wide interface; must be a multiple of
 *                `narrowW`
 */
class StreamWidthAdapter(narrowW: Int, wideW: Int) extends Module {
  require(wideW >= narrowW)
  require(wideW % narrowW == 0)

  val io = IO(new Bundle {
    val narrow = new StreamAdapterIO(narrowW)
    val wide = new StreamAdapterIO(wideW)
  })

  if (wideW == narrowW) {
    // Same width on both sides: no (de)serialization required.
    io.narrow.out <> io.wide.in
    io.wide.out <> io.narrow.in
  } else {
    val beats = wideW / narrowW

    // ---- narrow -> wide (packing) ----
    val narrow_beats = RegInit(0.U(log2Ceil(beats).W))
    val narrow_last_beat = narrow_beats === (beats-1).U
    // Only the first beats-1 narrow words are registered; the final one is
    // forwarded combinationally together with the buffered ones.
    val narrow_data = Reg(Vec(beats-1, UInt(narrowW.W)))

    // ---- wide -> narrow (unpacking) ----
    val wide_beats = RegInit(0.U(log2Ceil(beats).W))
    val wide_last_beat = wide_beats === (beats-1).U

    // Buffering beats are always accepted; the last beat needs the wide
    // consumer to be ready because it completes the wide word.
    io.narrow.in.ready := Mux(narrow_last_beat, io.wide.out.ready, true.B)
    when (io.narrow.in.fire()) {
      narrow_beats := Mux(narrow_last_beat, 0.U, narrow_beats + 1.U)
      when (!narrow_last_beat) { narrow_data(narrow_beats) := io.narrow.in.bits }
    }

    io.wide.out.valid := narrow_last_beat && io.narrow.in.valid
    // The current (last) narrow beat forms the most-significant slice.
    io.wide.out.bits := Cat(io.narrow.in.bits, narrow_data.asUInt)

    io.narrow.out.valid := io.wide.in.valid
    // Select the narrowW-bit slice for the current beat. The previous
    // expression shifted by (wide_beats << 3), i.e. 8 bits per beat, which is
    // only correct when narrowW == 8; indexing a Vec view of the wide word
    // advances by narrowW bits per beat for every legal width.
    io.narrow.out.bits := io.wide.in.bits.asTypeOf(Vec(beats, UInt(narrowW.W)))(wide_beats)
    when (io.narrow.out.fire()) {
      wide_beats := Mux(wide_last_beat, 0.U, wide_beats + 1.U)
    }
    // The wide word is consumed only once its final slice has been accepted.
    io.wide.in.ready := wide_last_beat && io.narrow.out.ready
  }
}
/**
* A helper container to serialize per-stream constants to the header. This is
* currently somewhat redundant with the default header emission for widgets.
@ -22,15 +70,17 @@ case class StreamDriverParameters(
name: String,
bufferBaseAddress: Int,
countMMIOAddress: Int,
bufferCapacity: Int)
bufferCapacity: Int,
bufferWidthBytes: Int)
class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters) extends StreamEngine(p) {
val cpuManagedAXI4params = p(CPUManagedAXI4Key).get
require(BridgeStreamConstants.streamWidthBits == cpuManagedAXI4params.dataBits,
s"CPU-managed AXI4 IF data width must match the stream width: ${BridgeStreamConstants.streamWidthBits}.")
require(BridgeStreamConstants.streamWidthBits >= cpuManagedAXI4params.dataBits,
s"CPU-managed AXI4 IF data width (${cpuManagedAXI4params.dataBits}) must be less than or equal to the stream width (${BridgeStreamConstants.streamWidthBits}).")
val beatBytes = cpuManagedAXI4params.dataBits / 8
val axiBeatBytes = cpuManagedAXI4params.dataBits / 8
val bufferWidthBytes = BridgeStreamConstants.streamWidthBits / 8
val cpuManagedAXI4NodeOpt = Some(AXI4SlaveNode(
Seq(AXI4SlavePortParameters(
@ -39,10 +89,10 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters)
resources = (new MemoryDevice).reg,
regionType = RegionType.UNCACHED, // cacheable
executable = false,
supportsWrite = TransferSizes(beatBytes, 4096),
supportsRead = TransferSizes(beatBytes, 4096),
supportsWrite = TransferSizes(axiBeatBytes, 4096),
supportsRead = TransferSizes(axiBeatBytes, 4096),
interleavedId = Some(0))), // slave does not interleave read responses
beatBytes = beatBytes)
beatBytes = axiBeatBytes)
))
)
@ -56,8 +106,8 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters)
// FromHostCPU streams are implemented using the AW, W, B channels, which
// write into large BRAM FIFOs for each stream.
assert(!axi4.aw.valid || axi4.aw.bits.size === log2Ceil(beatBytes).U)
assert(!axi4.w.valid || axi4.w.bits.strb === ~0.U(beatBytes.W))
assert(!axi4.aw.valid || axi4.aw.bits.size === log2Ceil(axiBeatBytes).U)
assert(!axi4.w.valid || axi4.w.bits.strb === ~0.U(axiBeatBytes.W))
axi4.b.bits.resp := 0.U(2.W)
axi4.b.bits.id := axi4.aw.bits.id
@ -67,7 +117,6 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters)
axi4.aw.ready := false.B
axi4.w.ready := false.B
// TODO: Chisel naming prefix to indicate what channel this hw belongs to.
// This demultiplexes the AW, W, and B channels onto the decoupled ports representing each stream.
def elaborateFromHostCPUStream(
@ -76,6 +125,12 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters)
idx: Int,
addressSpaceBits: Int): StreamDriverParameters = prefix(chParams.name) {
val ser_des = Module(new StreamWidthAdapter(cpuManagedAXI4params.dataBits, BridgeStreamConstants.streamWidthBits))
// unused
ser_des.io.wide.in.bits := 0.U
ser_des.io.wide.in.valid := false.B
ser_des.io.narrow.out.ready := false.B
val streamName = chParams.name
val grant = (axi4.aw.bits.addr >> addressSpaceBits) === idx.U
@ -88,11 +143,15 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters)
val countAddr =
attach(incomingQueue.io.count, s"${chParams.name}_count", ReadOnly, substruct = false)
incomingQueue.io.enq.bits := ser_des.io.wide.out.bits
incomingQueue.io.enq.valid := ser_des.io.wide.out.valid
ser_des.io.wide.out.ready := incomingQueue.io.enq.ready
val writeHelper = DecoupledHelper(
axi4.aw.valid,
axi4.w.valid,
axi4.b.ready,
incomingQueue.io.enq.ready
ser_des.io.narrow.in.ready
)
// TODO: Get rid of this magic number.
@ -108,18 +167,19 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters)
axi4.b.valid := writeHelper.fire(axi4.b.ready, lastWriteBeat)
}
incomingQueue.io.enq.valid := grant && writeHelper.fire(incomingQueue.io.enq.ready)
incomingQueue.io.enq.bits := axi4.w.bits.data
ser_des.io.narrow.in.valid := grant && writeHelper.fire(ser_des.io.narrow.in.ready)
ser_des.io.narrow.in.bits := axi4.w.bits.data
StreamDriverParameters(
chParams.name,
idx * (1 << addressSpaceBits),
countAddr,
chParams.fpgaBufferDepth
chParams.fpgaBufferDepth,
chParams.fpgaBufferWidthBytes
)
}
assert(!axi4.ar.valid || axi4.ar.bits.size === log2Ceil(beatBytes).U)
assert(!axi4.ar.valid || axi4.ar.bits.size === log2Ceil(axiBeatBytes).U)
axi4.r.bits.resp := 0.U(2.W)
axi4.r.bits.id := axi4.ar.bits.id
@ -134,6 +194,13 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters)
idx: Int,
addressSpaceBits: Int): StreamDriverParameters = prefix(chParams.name) {
val ser_des = Module(new StreamWidthAdapter(cpuManagedAXI4params.dataBits, BridgeStreamConstants.streamWidthBits))
// unused
ser_des.io.narrow.in.bits := 0.U
ser_des.io.narrow.in.valid := false.B
ser_des.io.wide.out.ready := false.B
val grant = (axi4.ar.bits.addr >> addressSpaceBits) === idx.U
val outgoingQueue = Module(new BRAMQueue(chParams.fpgaBufferDepth)(UInt(BridgeStreamConstants.streamWidthBits.W)))
@ -141,6 +208,10 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters)
outgoingQueue.io.enq <> channel
ser_des.io.wide.in.bits := outgoingQueue.io.deq.bits
ser_des.io.wide.in.valid := outgoingQueue.io.deq.valid
outgoingQueue.io.deq.ready := ser_des.io.wide.in.ready
// check to see if axi4 has valid output instead of waiting for timeouts
val countAddr =
attach(outgoingQueue.io.count, s"${chParams.name}_count", ReadOnly, substruct = false)
@ -148,7 +219,7 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters)
val readHelper = DecoupledHelper(
axi4.ar.valid,
axi4.r.ready,
outgoingQueue.io.deq.valid
ser_des.io.narrow.out.valid
)
val readBeatCounter = RegInit(0.U(9.W))
@ -157,11 +228,11 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters)
readBeatCounter := Mux(lastReadBeat, 0.U, readBeatCounter + 1.U)
}
outgoingQueue.io.deq.ready := grant && readHelper.fire(outgoingQueue.io.deq.valid)
ser_des.io.narrow.out.ready := grant && readHelper.fire(ser_des.io.narrow.out.valid)
when (grant) {
axi4.r.valid := readHelper.fire(axi4.r.ready)
axi4.r.bits.data := outgoingQueue.io.deq.bits
axi4.r.bits.data := ser_des.io.narrow.out.bits
axi4.r.bits.last := lastReadBeat
axi4.ar.ready := readHelper.fire(axi4.ar.valid, lastReadBeat)
}
@ -169,7 +240,8 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters)
chParams.name,
idx * (1 << addressSpaceBits),
countAddr,
chParams.fpgaBufferDepth)
chParams.fpgaBufferDepth,
chParams.fpgaBufferWidthBytes)
}
def implementStreams[A <: StreamParameters](
@ -187,7 +259,7 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters)
// fractured into multiple, smaller AXI4 transactions (<= 4K in size), it
// is simplest to maintain the illusion that each stream is granted an
// address range at least as large as the largest DMA access.
def streamASBits = log2Ceil(beatBytes * streamParameters.map(_.fpgaBufferDepth).max)
def streamASBits = log2Ceil(bufferWidthBytes * streamParameters.map(_.fpgaBufferDepth).max)
for (((port, params), idx) <- streamPorts.zip(streamParameters).zipWithIndex) yield {
elaborator(port, params, idx, streamASBits)
@ -206,7 +278,8 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters)
| std::string(${CStrLit(p.name).toC}),
| ${UInt64(p.bufferBaseAddress).toC},
| ${UInt64(base + p.countMMIOAddress).toC},
| ${UInt32(p.bufferCapacity).toC}
| ${UInt32(p.bufferCapacity).toC},
| ${UInt32(p.bufferWidthBytes).toC}
|)""".stripMargin)))
}

View File

@ -26,15 +26,16 @@ trait StreamParameters {
def name: String
def idx: Int
def fpgaBufferDepth: Int
def fpgaBufferWidthBytes: Int
/**
* Pretty prints a description of this stream.
*/
def summaryString: String =
s"${name}, FPGA Buffer Depth: ${fpgaBufferDepth} Beats"
s"Name: ${name}, Idx: ${idx}, FPGA Buffer Depth: ${fpgaBufferDepth}, FPGA Buffer Width: ${fpgaBufferWidthBytes}"
}
case class StreamSourceParameters(name: String, idx: Int, fpgaBufferDepth: Int) extends StreamParameters
case class StreamSinkParameters (name: String, idx: Int, fpgaBufferDepth: Int) extends StreamParameters
case class StreamSourceParameters(name: String, idx: Int, fpgaBufferDepth: Int, fpgaBufferWidthBytes: Int) extends StreamParameters
case class StreamSinkParameters (name: String, idx: Int, fpgaBufferDepth: Int, fpgaBufferWidthBytes: Int) extends StreamParameters
/**
* A wrapper class for common arguments to all StreamEngine implementations.

View File

@ -58,7 +58,8 @@ trait StreamFromHostCPU { self: Widget =>
final def streamSinkParams = StreamSinkParameters(
fromHostStreamName,
fromHostStreamIdx,
fromHostCPUQueueDepth)
fromHostCPUQueueDepth,
BridgeStreamConstants.streamWidthBits/8)
private val _streamDeq = InModuleBody {
@ -83,7 +84,8 @@ trait StreamToHostCPU { self: Widget =>
final def streamSourceParams = StreamSourceParameters(
toHostStreamName,
toHostStreamIdx,
toHostCPUQueueDepth)
toHostCPUQueueDepth,
BridgeStreamConstants.streamWidthBits/8)
private val _streamEnq = InModuleBody {
val streamToHostCPU = IO(BridgeStreamConstants.streamChiselType)