decouple queue interface width exposed to streaming bridges from AXI4 DMA width
This commit is contained in:
parent
77891f3266
commit
b3dc0b0eb1
|
@ -1099,7 +1099,7 @@ class RHSResearchNitefuryIIInstanceDeployManager(XilinxAlveoInstanceDeployManage
|
|||
collect = run('lspci | grep -i xilinx')
|
||||
bdfs = [ i[:7] for i in collect.splitlines() if len(i.strip()) >= 0 ]
|
||||
bdf = bdfs[slotno].replace('.', ':').split(':')
|
||||
extra_args = f"+domain=0x0000 +bus=0x{bdf[0]} +device=0x{bdf[1]} +function=0x0 +bar=0x0 +pci-vendor=0x10ee +pci-device=0x7011"
|
||||
extra_args = f"+domain=0x0000 +bus=0x{bdf[0]} +device=0x{bdf[1]} +function=0x0 +bar=0x0 +pci-vendor=0x10ee +pci-device=0x903f"
|
||||
else:
|
||||
extra_args = None
|
||||
|
||||
|
|
|
@ -1 +1 @@
|
|||
Subproject commit 24a56a672afc7cb5d79d04a139c9ecfee61fe32d
|
||||
Subproject commit f1dcf86f46db0bbf27432190f5e694061ae12a81
|
|
@ -24,10 +24,10 @@ size_t CPUManagedStreams::CPUToFPGADriver::push(void *src,
|
|||
// implement non-multiples of 512b. The FPGA-side queue will take on the
|
||||
// high-order bytes of the final beat in the transaction, and the strobe is
|
||||
// not respected. So put the assertion here and discuss what to do next.
|
||||
assert((num_bytes % beat_bytes()) == 0);
|
||||
assert((num_bytes % fpga_buffer_width_bytes()) == 0);
|
||||
|
||||
auto num_beats = num_bytes / beat_bytes();
|
||||
auto threshold_beats = required_bytes / beat_bytes();
|
||||
auto num_beats = num_bytes / fpga_buffer_width_bytes();
|
||||
auto threshold_beats = required_bytes / fpga_buffer_width_bytes();
|
||||
|
||||
assert(threshold_beats <= fpga_buffer_size());
|
||||
auto space_available = fpga_buffer_size() - mmio_read(count_addr());
|
||||
|
@ -37,7 +37,7 @@ size_t CPUManagedStreams::CPUToFPGADriver::push(void *src,
|
|||
}
|
||||
|
||||
auto push_beats = std::min(space_available, num_beats);
|
||||
auto push_bytes = push_beats * beat_bytes();
|
||||
auto push_bytes = push_beats * fpga_buffer_width_bytes();
|
||||
auto bytes_written =
|
||||
cpu_managed_axi4_write(dma_addr(), (char *)src, push_bytes);
|
||||
assert(bytes_written == push_bytes);
|
||||
|
@ -70,10 +70,10 @@ size_t CPUManagedStreams::FPGAToCPUDriver::pull(void *dest,
|
|||
// Due to the destructive nature of reads, if we wish to support reads that
|
||||
// aren't a multiple of 512b, we'll need to keep a little buffer around for
|
||||
// the remainder, and prepend this to the destination buffer.
|
||||
assert((num_bytes % beat_bytes()) == 0);
|
||||
assert((num_bytes % fpga_buffer_width_bytes()) == 0);
|
||||
|
||||
auto num_beats = num_bytes / beat_bytes();
|
||||
auto threshold_beats = required_bytes / beat_bytes();
|
||||
auto num_beats = num_bytes / fpga_buffer_width_bytes();
|
||||
auto threshold_beats = required_bytes / fpga_buffer_width_bytes();
|
||||
|
||||
assert(threshold_beats <= fpga_buffer_size());
|
||||
auto count = mmio_read(count_addr());
|
||||
|
@ -83,7 +83,7 @@ size_t CPUManagedStreams::FPGAToCPUDriver::pull(void *dest,
|
|||
}
|
||||
|
||||
auto pull_beats = std::min(count, num_beats);
|
||||
auto pull_bytes = pull_beats * beat_bytes();
|
||||
auto pull_bytes = pull_beats * fpga_buffer_width_bytes();
|
||||
auto bytes_read = cpu_managed_axi4_read(dma_addr(), (char *)dest, pull_bytes);
|
||||
assert(bytes_read == pull_bytes);
|
||||
return bytes_read;
|
||||
|
|
|
@ -57,13 +57,16 @@ struct StreamParameters {
|
|||
uint64_t dma_addr;
|
||||
uint64_t count_addr;
|
||||
uint32_t fpga_buffer_size;
|
||||
uint32_t fpga_buffer_width_bytes;
|
||||
|
||||
StreamParameters(const std::string &stream_name,
|
||||
uint64_t dma_addr,
|
||||
uint64_t count_addr,
|
||||
int fpga_buffer_size)
|
||||
int fpga_buffer_size,
|
||||
int fpga_buffer_width_bytes)
|
||||
: stream_name(stream_name), dma_addr(dma_addr), count_addr(count_addr),
|
||||
fpga_buffer_size(fpga_buffer_size) {}
|
||||
fpga_buffer_size(fpga_buffer_size),
|
||||
fpga_buffer_width_bytes(fpga_buffer_width_bytes) {}
|
||||
};
|
||||
|
||||
/**
|
||||
|
@ -106,7 +109,7 @@ public:
|
|||
int fpga_buffer_size() { return params.fpga_buffer_size; };
|
||||
uint64_t dma_addr() { return params.dma_addr; };
|
||||
uint64_t count_addr() { return params.count_addr; };
|
||||
uint64_t beat_bytes() const { return io.get_beat_bytes(); }
|
||||
uint64_t fpga_buffer_width_bytes() const { return params.fpga_buffer_width_bytes; }
|
||||
};
|
||||
|
||||
/**
|
||||
|
|
|
@ -128,7 +128,7 @@ class NitefuryConfig extends Config(new Config((site, here, up) => {
|
|||
case StreamEngineInstantiatorKey => (e: StreamEngineParameters, p: Parameters) => new CPUManagedStreamEngine(p, e)
|
||||
case CPUManagedAXI4Key => Some(CPUManagedAXI4Params(
|
||||
addrBits = 64,
|
||||
dataBits = 512,
|
||||
dataBits = 128,
|
||||
idBits = 4,
|
||||
))
|
||||
case FPGAManagedAXI4Key => None
|
||||
|
|
|
@ -14,6 +14,54 @@ import freechips.rocketchip.util.DecoupledHelper
|
|||
import midas.targetutils.xdc
|
||||
import midas.widgets._
|
||||
|
||||
/**
  * A pair of opposite-direction decoupled channels carrying `w`-bit payloads.
  *
  * `in` is a flipped decoupled port and `out` is a (non-flipped) decoupled
  * port of the same width.
  *
  * @param w payload width, in bits, of both channels
  */
class StreamAdapterIO(val w: Int) extends Bundle {
  val in = Flipped(Decoupled(UInt(w.W)))
  val out = Decoupled(UInt(w.W))

  /**
    * Cross-connects this bundle with `other`: this bundle's `in` is
    * bulk-connected to `other.out`, and `other.in` to this bundle's `out`.
    *
    * @param other the peer bundle to connect against
    */
  // Explicit `: Unit =` replaces the deprecated procedure syntax.
  def flipConnect(other: StreamAdapterIO): Unit = {
    in <> other.out
    other.in <> out
  }
}
|
||||
|
||||
/**
  * Converts between a narrow and a wide decoupled stream in both directions.
  *
  *  - narrow.in -> wide.out: `wideW / narrowW` consecutive narrow beats are
  *    packed into a single wide beat, with the first narrow beat placed in the
  *    low-order bits.
  *  - wide.in -> narrow.out: each wide beat is emitted as `wideW / narrowW`
  *    narrow beats, low-order slice first.
  *
  * When both widths are equal the two sides are connected straight through.
  *
  * @param narrowW width in bits of the narrow side
  * @param wideW   width in bits of the wide side; must be a multiple of
  *                `narrowW` and at least as large
  */
class StreamWidthAdapter(narrowW: Int, wideW: Int) extends Module {
  require(wideW >= narrowW)
  require(wideW % narrowW == 0)
  val io = IO(new Bundle {
    val narrow = new StreamAdapterIO(narrowW)
    val wide = new StreamAdapterIO(wideW)
  })

  if (wideW == narrowW) {
    // Same width on both sides: nothing to (de)serialize.
    io.narrow.out <> io.wide.in
    io.wide.out <> io.narrow.in
  } else {
    // Number of narrow beats that make up one wide beat.
    val beats = wideW / narrowW

    // ---- narrow.in -> wide.out (packing) ----
    val narrow_beats = RegInit(0.U(log2Ceil(beats).W))
    val narrow_last_beat = narrow_beats === (beats-1).U
    // Holds all but the final narrow beat; the final beat is forwarded
    // combinationally so the wide word is produced without an extra cycle.
    val narrow_data = Reg(Vec(beats-1, UInt(narrowW.W)))

    // Non-final beats are always accepted into the holding register; the final
    // beat can only be accepted when the wide consumer takes the packed word.
    io.narrow.in.ready := Mux(narrow_last_beat, io.wide.out.ready, true.B)
    when (io.narrow.in.fire()) {
      narrow_beats := Mux(narrow_last_beat, 0.U, narrow_beats + 1.U)
      when (!narrow_last_beat) { narrow_data(narrow_beats) := io.narrow.in.bits }
    }
    io.wide.out.valid := narrow_last_beat && io.narrow.in.valid
    // Final beat occupies the high-order bits, earlier beats the low-order bits.
    io.wide.out.bits := Cat(io.narrow.in.bits, narrow_data.asUInt)

    // ---- wide.in -> narrow.out (unpacking) ----
    val wide_beats = RegInit(0.U(log2Ceil(beats).W))
    val wide_last_beat = wide_beats === (beats-1).U

    io.narrow.out.valid := io.wide.in.valid
    // Select the narrowW-bit slice for the current beat. Element 0 of the Vec
    // view is the low-order slice, mirroring the packing order above. (A shift
    // by `wide_beats << 3` only advances 8 bits per beat, which is wrong for
    // any narrowW other than 8.)
    io.narrow.out.bits := io.wide.in.bits.asTypeOf(Vec(beats, UInt(narrowW.W)))(wide_beats)
    when (io.narrow.out.fire()) {
      wide_beats := Mux(wide_last_beat, 0.U, wide_beats + 1.U)
    }
    // The wide word is only consumed once its last slice has been handed off.
    io.wide.in.ready := wide_last_beat && io.narrow.out.ready
  }
}
|
||||
|
||||
/**
|
||||
* A helper container to serialize per-stream constants to the header. This is
|
||||
* currently somewhat redundant with the default header emission for widgets.
|
||||
|
@ -22,15 +70,17 @@ case class StreamDriverParameters(
|
|||
name: String,
|
||||
bufferBaseAddress: Int,
|
||||
countMMIOAddress: Int,
|
||||
bufferCapacity: Int)
|
||||
bufferCapacity: Int,
|
||||
bufferWidthBytes: Int)
|
||||
|
||||
class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters) extends StreamEngine(p) {
|
||||
|
||||
val cpuManagedAXI4params = p(CPUManagedAXI4Key).get
|
||||
require(BridgeStreamConstants.streamWidthBits == cpuManagedAXI4params.dataBits,
|
||||
s"CPU-managed AXI4 IF data width must match the stream width: ${BridgeStreamConstants.streamWidthBits}.")
|
||||
require(BridgeStreamConstants.streamWidthBits >= cpuManagedAXI4params.dataBits,
|
||||
s"CPU-managed AXI4 IF data width (${cpuManagedAXI4params.dataBits}) must be less than or equal to the stream width (${BridgeStreamConstants.streamWidthBits}).")
|
||||
|
||||
val beatBytes = cpuManagedAXI4params.dataBits / 8
|
||||
val axiBeatBytes = cpuManagedAXI4params.dataBits / 8
|
||||
val bufferWidthBytes = BridgeStreamConstants.streamWidthBits / 8
|
||||
|
||||
val cpuManagedAXI4NodeOpt = Some(AXI4SlaveNode(
|
||||
Seq(AXI4SlavePortParameters(
|
||||
|
@ -39,10 +89,10 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters)
|
|||
resources = (new MemoryDevice).reg,
|
||||
regionType = RegionType.UNCACHED, // cacheable
|
||||
executable = false,
|
||||
supportsWrite = TransferSizes(beatBytes, 4096),
|
||||
supportsRead = TransferSizes(beatBytes, 4096),
|
||||
supportsWrite = TransferSizes(axiBeatBytes, 4096),
|
||||
supportsRead = TransferSizes(axiBeatBytes, 4096),
|
||||
interleavedId = Some(0))), // slave does not interleave read responses
|
||||
beatBytes = beatBytes)
|
||||
beatBytes = axiBeatBytes)
|
||||
))
|
||||
)
|
||||
|
||||
|
@ -56,8 +106,8 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters)
|
|||
|
||||
// FromHostCPU streams are implemented using the AW, W, B channels, which
|
||||
// write into large BRAM FIFOs for each stream.
|
||||
assert(!axi4.aw.valid || axi4.aw.bits.size === log2Ceil(beatBytes).U)
|
||||
assert(!axi4.w.valid || axi4.w.bits.strb === ~0.U(beatBytes.W))
|
||||
assert(!axi4.aw.valid || axi4.aw.bits.size === log2Ceil(axiBeatBytes).U)
|
||||
assert(!axi4.w.valid || axi4.w.bits.strb === ~0.U(axiBeatBytes.W))
|
||||
|
||||
axi4.b.bits.resp := 0.U(2.W)
|
||||
axi4.b.bits.id := axi4.aw.bits.id
|
||||
|
@ -67,7 +117,6 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters)
|
|||
axi4.aw.ready := false.B
|
||||
axi4.w.ready := false.B
|
||||
|
||||
|
||||
// TODO: Chisel naming prefix to indicate what channel this hw belongs to.
|
||||
// This demultiplexes the AW, W, and B channels onto the decoupled ports representing each stream.
|
||||
def elaborateFromHostCPUStream(
|
||||
|
@ -76,6 +125,12 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters)
|
|||
idx: Int,
|
||||
addressSpaceBits: Int): StreamDriverParameters = prefix(chParams.name) {
|
||||
|
||||
val ser_des = Module(new StreamWidthAdapter(cpuManagedAXI4params.dataBits, BridgeStreamConstants.streamWidthBits))
|
||||
// unused
|
||||
ser_des.io.wide.in.bits := 0.U
|
||||
ser_des.io.wide.in.valid := false.B
|
||||
ser_des.io.narrow.out.ready := false.B
|
||||
|
||||
val streamName = chParams.name
|
||||
val grant = (axi4.aw.bits.addr >> addressSpaceBits) === idx.U
|
||||
|
||||
|
@ -88,11 +143,15 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters)
|
|||
val countAddr =
|
||||
attach(incomingQueue.io.count, s"${chParams.name}_count", ReadOnly, substruct = false)
|
||||
|
||||
incomingQueue.io.enq.bits := ser_des.io.wide.out.bits
|
||||
incomingQueue.io.enq.valid := ser_des.io.wide.out.valid
|
||||
ser_des.io.wide.out.ready := incomingQueue.io.enq.ready
|
||||
|
||||
val writeHelper = DecoupledHelper(
|
||||
axi4.aw.valid,
|
||||
axi4.w.valid,
|
||||
axi4.b.ready,
|
||||
incomingQueue.io.enq.ready
|
||||
ser_des.io.narrow.in.ready
|
||||
)
|
||||
|
||||
// TODO: Get rid of this magic number.
|
||||
|
@ -108,18 +167,19 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters)
|
|||
axi4.b.valid := writeHelper.fire(axi4.b.ready, lastWriteBeat)
|
||||
}
|
||||
|
||||
incomingQueue.io.enq.valid := grant && writeHelper.fire(incomingQueue.io.enq.ready)
|
||||
incomingQueue.io.enq.bits := axi4.w.bits.data
|
||||
ser_des.io.narrow.in.valid := grant && writeHelper.fire(ser_des.io.narrow.in.ready)
|
||||
ser_des.io.narrow.in.bits := axi4.w.bits.data
|
||||
|
||||
StreamDriverParameters(
|
||||
chParams.name,
|
||||
idx * (1 << addressSpaceBits),
|
||||
countAddr,
|
||||
chParams.fpgaBufferDepth
|
||||
chParams.fpgaBufferDepth,
|
||||
chParams.fpgaBufferWidthBytes
|
||||
)
|
||||
}
|
||||
|
||||
assert(!axi4.ar.valid || axi4.ar.bits.size === log2Ceil(beatBytes).U)
|
||||
assert(!axi4.ar.valid || axi4.ar.bits.size === log2Ceil(axiBeatBytes).U)
|
||||
|
||||
axi4.r.bits.resp := 0.U(2.W)
|
||||
axi4.r.bits.id := axi4.ar.bits.id
|
||||
|
@ -134,6 +194,13 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters)
|
|||
idx: Int,
|
||||
addressSpaceBits: Int): StreamDriverParameters = prefix(chParams.name) {
|
||||
|
||||
|
||||
val ser_des = Module(new StreamWidthAdapter(cpuManagedAXI4params.dataBits, BridgeStreamConstants.streamWidthBits))
|
||||
// unused
|
||||
ser_des.io.narrow.in.bits := 0.U
|
||||
ser_des.io.narrow.in.valid := false.B
|
||||
ser_des.io.wide.out.ready := false.B
|
||||
|
||||
val grant = (axi4.ar.bits.addr >> addressSpaceBits) === idx.U
|
||||
|
||||
val outgoingQueue = Module(new BRAMQueue(chParams.fpgaBufferDepth)(UInt(BridgeStreamConstants.streamWidthBits.W)))
|
||||
|
@ -141,6 +208,10 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters)
|
|||
|
||||
outgoingQueue.io.enq <> channel
|
||||
|
||||
ser_des.io.wide.in.bits := outgoingQueue.io.deq.bits
|
||||
ser_des.io.wide.in.valid := outgoingQueue.io.deq.valid
|
||||
outgoingQueue.io.deq.ready := ser_des.io.wide.in.ready
|
||||
|
||||
// check to see if axi4 has valid output instead of waiting for timeouts
|
||||
val countAddr =
|
||||
attach(outgoingQueue.io.count, s"${chParams.name}_count", ReadOnly, substruct = false)
|
||||
|
@ -148,7 +219,7 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters)
|
|||
val readHelper = DecoupledHelper(
|
||||
axi4.ar.valid,
|
||||
axi4.r.ready,
|
||||
outgoingQueue.io.deq.valid
|
||||
ser_des.io.narrow.out.valid
|
||||
)
|
||||
|
||||
val readBeatCounter = RegInit(0.U(9.W))
|
||||
|
@ -157,11 +228,11 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters)
|
|||
readBeatCounter := Mux(lastReadBeat, 0.U, readBeatCounter + 1.U)
|
||||
}
|
||||
|
||||
outgoingQueue.io.deq.ready := grant && readHelper.fire(outgoingQueue.io.deq.valid)
|
||||
ser_des.io.narrow.out.ready := grant && readHelper.fire(ser_des.io.narrow.out.valid)
|
||||
|
||||
when (grant) {
|
||||
axi4.r.valid := readHelper.fire(axi4.r.ready)
|
||||
axi4.r.bits.data := outgoingQueue.io.deq.bits
|
||||
axi4.r.bits.data := ser_des.io.narrow.out.bits
|
||||
axi4.r.bits.last := lastReadBeat
|
||||
axi4.ar.ready := readHelper.fire(axi4.ar.valid, lastReadBeat)
|
||||
}
|
||||
|
@ -169,7 +240,8 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters)
|
|||
chParams.name,
|
||||
idx * (1 << addressSpaceBits),
|
||||
countAddr,
|
||||
chParams.fpgaBufferDepth)
|
||||
chParams.fpgaBufferDepth,
|
||||
chParams.fpgaBufferWidthBytes)
|
||||
}
|
||||
|
||||
def implementStreams[A <: StreamParameters](
|
||||
|
@ -187,7 +259,7 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters)
|
|||
// fractured into multiple, smaller AXI4 transactions (<= 4K in size), it
|
||||
// is simplest to maintain the illusion that each stream is granted an
|
||||
// address range at least as large as the largest DMA access.
|
||||
def streamASBits = log2Ceil(beatBytes * streamParameters.map(_.fpgaBufferDepth).max)
|
||||
def streamASBits = log2Ceil(bufferWidthBytes * streamParameters.map(_.fpgaBufferDepth).max)
|
||||
|
||||
for (((port, params), idx) <- streamPorts.zip(streamParameters).zipWithIndex) yield {
|
||||
elaborator(port, params, idx, streamASBits)
|
||||
|
@ -206,7 +278,8 @@ class CPUManagedStreamEngine(p: Parameters, val params: StreamEngineParameters)
|
|||
| std::string(${CStrLit(p.name).toC}),
|
||||
| ${UInt64(p.bufferBaseAddress).toC},
|
||||
| ${UInt64(base + p.countMMIOAddress).toC},
|
||||
| ${UInt32(p.bufferCapacity).toC}
|
||||
| ${UInt32(p.bufferCapacity).toC},
|
||||
| ${UInt32(p.bufferWidthBytes).toC}
|
||||
|)""".stripMargin)))
|
||||
}
|
||||
|
||||
|
|
|
@ -26,15 +26,16 @@ trait StreamParameters {
|
|||
def name: String
|
||||
def idx: Int
|
||||
def fpgaBufferDepth: Int
|
||||
def fpgaBufferWidthBytes: Int
|
||||
/**
|
||||
* Pretty prints a description of this stream.
|
||||
*/
|
||||
def summaryString: String =
|
||||
s"${name}, FPGA Buffer Depth: ${fpgaBufferDepth} Beats"
|
||||
s"Name: ${name}, Idx: ${idx}, FPGA Buffer Depth: ${fpgaBufferDepth}, FPGA Buffer Width: ${fpgaBufferWidthBytes}"
|
||||
}
|
||||
|
||||
case class StreamSourceParameters(name: String, idx: Int, fpgaBufferDepth: Int) extends StreamParameters
|
||||
case class StreamSinkParameters (name: String, idx: Int, fpgaBufferDepth: Int) extends StreamParameters
|
||||
case class StreamSourceParameters(name: String, idx: Int, fpgaBufferDepth: Int, fpgaBufferWidthBytes: Int) extends StreamParameters
|
||||
case class StreamSinkParameters (name: String, idx: Int, fpgaBufferDepth: Int, fpgaBufferWidthBytes: Int) extends StreamParameters
|
||||
|
||||
/**
|
||||
* A wrapper class for common arguments to all StreamEngine implementations.
|
||||
|
|
|
@ -58,7 +58,8 @@ trait StreamFromHostCPU { self: Widget =>
|
|||
final def streamSinkParams = StreamSinkParameters(
|
||||
fromHostStreamName,
|
||||
fromHostStreamIdx,
|
||||
fromHostCPUQueueDepth)
|
||||
fromHostCPUQueueDepth,
|
||||
BridgeStreamConstants.streamWidthBits/8)
|
||||
|
||||
|
||||
private val _streamDeq = InModuleBody {
|
||||
|
@ -83,7 +84,8 @@ trait StreamToHostCPU { self: Widget =>
|
|||
final def streamSourceParams = StreamSourceParameters(
|
||||
toHostStreamName,
|
||||
toHostStreamIdx,
|
||||
toHostCPUQueueDepth)
|
||||
toHostCPUQueueDepth,
|
||||
BridgeStreamConstants.streamWidthBits/8)
|
||||
|
||||
private val _streamEnq = InModuleBody {
|
||||
val streamToHostCPU = IO(BridgeStreamConstants.streamChiselType)
|
||||
|
|
Loading…
Reference in New Issue