FPGA-managed bridge stream support in metasimulation (#1181)
* metasim-able FPGA-controlled bridge streams * simif: Add a virtual method to permit doing streamengine init * Remove unneeded vitis kernel def changes * Address some of nandors comments
This commit is contained in:
parent
d74c8d639d
commit
fdb5d6d439
|
@ -29,7 +29,6 @@ project {
|
|||
"glob:**midas/src/main/scala/midas/SynthUnitTests.scala",
|
||||
"glob:**midas/src/main/scala/midas/core/CPUManagedStreamEngine.scala",
|
||||
"glob:**midas/src/main/scala/midas/core/Channel.scala",
|
||||
"glob:**midas/src/main/scala/midas/core/FPGAManagedStreamEngine.scala",
|
||||
"glob:**midas/src/main/scala/midas/core/FPGATop.scala",
|
||||
"glob:**midas/src/main/scala/midas/core/Interfaces.scala",
|
||||
"glob:**midas/src/main/scala/midas/core/LIBDNUnitTest.scala",
|
||||
|
|
|
@ -292,6 +292,7 @@ void tracerv_t::tick() {
|
|||
|
||||
// Pull in any remaining tokens and flush them to file
|
||||
void tracerv_t::flush() {
|
||||
pull_flush(stream_idx);
|
||||
while (this->trace_enabled && (process_tokens(this->stream_depth, 0) > 0))
|
||||
;
|
||||
}
|
||||
|
|
|
@ -72,6 +72,7 @@ protected:
|
|||
return 0;
|
||||
return sim->push(stream_idx, data, size, minimum_batch_size);
|
||||
}
|
||||
// Forwards a flush hint for stream `stream_idx` to the simulation interface.
void pull_flush(unsigned stream_idx) {
  sim->pull_flush(stream_idx);
}
|
||||
|
||||
private:
|
||||
simif_t *sim;
|
||||
|
|
|
@ -0,0 +1,22 @@
|
|||
// See LICENSE for license details.
|
||||
|
||||
#ifndef __BRIDGES_BRIDGE_STREAM_DRIVER_H
|
||||
#define __BRIDGES_BRIDGE_STREAM_DRIVER_H
|
||||
|
||||
/**
 * @brief Abstract interface for the driver side of an FPGA-to-CPU bridge
 * stream.
 *
 * Concrete stream-engine drivers implement these three hooks; callers hold
 * them through a pointer or reference to this base.
 */
class FPGAToCPUStreamDriver {
public:
  virtual ~FPGAToCPUStreamDriver() = default;

  /** One-time setup hook, invoked before the stream is used. */
  virtual void init() = 0;

  /**
   * @param dest Buffer that receives the dequeued bytes
   * @param num_bytes Number of bytes requested
   * @param required_bytes Minimum acceptable number of bytes
   * @return size_t Number of bytes dequeued
   */
  virtual size_t pull(void *dest, size_t num_bytes, size_t required_bytes) = 0;

  /** Hook to make any data still held back by the stream available. */
  virtual void flush() = 0;
};
|
||||
|
||||
/**
 * @brief Abstract interface for the driver side of a CPU-to-FPGA bridge
 * stream.
 *
 * Concrete stream-engine drivers implement these three hooks; callers hold
 * them through a pointer or reference to this base.
 */
class CPUToFPGAStreamDriver {
public:
  virtual ~CPUToFPGAStreamDriver() = default;

  /** One-time setup hook, invoked before the stream is used. */
  virtual void init() = 0;

  /**
   * @param src Buffer holding the bytes to enqueue
   * @param num_bytes Number of bytes offered
   * @param required_bytes Minimum acceptable number of bytes
   * @return size_t Number of bytes enqueued
   */
  virtual size_t push(void *src, size_t num_bytes, size_t required_bytes) = 0;

  /** Hook to force out any data still held back on the way to the FPGA. */
  virtual void flush() = 0;
};
|
||||
|
||||
#endif // __BRIDGES_BRIDGE_STREAM_DRIVER_H
|
|
@ -12,7 +12,9 @@
|
|||
* would be enqueued, this method enqueues none and returns 0.
|
||||
* @return size_t
|
||||
*/
|
||||
size_t StreamFromCPU::push(void *src, size_t num_bytes, size_t required_bytes) {
|
||||
size_t CPUManagedStreams::CPUToFPGADriver::push(void *src,
|
||||
size_t num_bytes,
|
||||
size_t required_bytes) {
|
||||
assert(num_bytes >= required_bytes);
|
||||
|
||||
// Similarly to above, the legacy implementation of DMA does not correctly
|
||||
|
@ -51,7 +53,9 @@ size_t StreamFromCPU::push(void *src, size_t num_bytes, size_t required_bytes) {
|
|||
* would be dequeued, dequeue none and return 0.
|
||||
* @return size_t Number of bytes successfully dequeued
|
||||
*/
|
||||
size_t StreamToCPU::pull(void *dest, size_t num_bytes, size_t required_bytes) {
|
||||
size_t CPUManagedStreams::FPGAToCPUDriver::pull(void *dest,
|
||||
size_t num_bytes,
|
||||
size_t required_bytes) {
|
||||
assert(num_bytes >= required_bytes);
|
||||
|
||||
// The legacy code is clearly broken for requests that aren't a
|
||||
|
|
|
@ -1,29 +1,33 @@
|
|||
// See LICENSE for license details.
|
||||
|
||||
#ifndef __CPU_MANAGED_STREAM_H
|
||||
#define __CPU_MANAGED_STREAM_H
|
||||
#ifndef __BRIDGES_CPU_MANAGED_STREAM_H
|
||||
#define __BRIDGES_CPU_MANAGED_STREAM_H
|
||||
|
||||
#include <functional>
|
||||
#include <string>
|
||||
|
||||
#include "bridge_stream_driver.h"
|
||||
|
||||
namespace CPUManagedStreams {
|
||||
/**
|
||||
* @brief Parameters emitted for a CPU-managed stream emitted by Golden Gate.
|
||||
*
|
||||
* This will be replaced by a protobuf-derived class, and re-used across both
|
||||
* Scala and C++.
|
||||
*/
|
||||
typedef struct CPUManagedStreamParameters {
|
||||
typedef struct StreamParameters {
|
||||
std::string stream_name;
|
||||
uint64_t dma_addr;
|
||||
uint64_t count_addr;
|
||||
uint32_t fpga_buffer_size;
|
||||
|
||||
CPUManagedStreamParameters(std::string stream_name,
|
||||
uint64_t dma_addr,
|
||||
uint64_t count_addr,
|
||||
int fpga_buffer_size)
|
||||
StreamParameters(std::string stream_name,
|
||||
uint64_t dma_addr,
|
||||
uint64_t count_addr,
|
||||
int fpga_buffer_size)
|
||||
: stream_name(stream_name), dma_addr(dma_addr), count_addr(count_addr),
|
||||
fpga_buffer_size(fpga_buffer_size){};
|
||||
} CPUManagedStreamParameters;
|
||||
} StreamParameters;
|
||||
|
||||
/**
|
||||
* @brief Base class for CPU-managed streams
|
||||
|
@ -39,14 +43,15 @@ typedef struct CPUManagedStreamParameters {
|
|||
* FPGA-managed AXI4 for their platform.
|
||||
*
|
||||
*/
|
||||
class CPUManagedStream {
|
||||
class CPUManagedDriver {
|
||||
public:
|
||||
CPUManagedStream(CPUManagedStreamParameters params,
|
||||
CPUManagedDriver(StreamParameters params,
|
||||
std::function<uint32_t(size_t)> mmio_read_func)
|
||||
: params(params), mmio_read_func(mmio_read_func){};
|
||||
virtual ~CPUManagedDriver(){};
|
||||
|
||||
private:
|
||||
CPUManagedStreamParameters params;
|
||||
StreamParameters params;
|
||||
std::function<uint32_t(size_t)> mmio_read_func;
|
||||
|
||||
public:
|
||||
|
@ -65,14 +70,20 @@ public:
|
|||
* implemented with axi4_read, and is provided by the host-platform.
|
||||
*
|
||||
*/
|
||||
class StreamToCPU : public CPUManagedStream {
|
||||
class FPGAToCPUDriver final : public CPUManagedDriver,
|
||||
public FPGAToCPUStreamDriver {
|
||||
public:
|
||||
StreamToCPU(CPUManagedStreamParameters params,
|
||||
std::function<uint32_t(size_t)> mmio_read,
|
||||
std::function<size_t(size_t, char *, size_t)> axi4_read)
|
||||
: CPUManagedStream(params, mmio_read), axi4_read(axi4_read){};
|
||||
FPGAToCPUDriver(StreamParameters params,
|
||||
std::function<uint32_t(size_t)> mmio_read,
|
||||
std::function<size_t(size_t, char *, size_t)> axi4_read)
|
||||
: CPUManagedDriver(params, mmio_read), axi4_read(axi4_read){};
|
||||
|
||||
size_t pull(void *dest, size_t num_bytes, size_t required_bytes);
|
||||
virtual size_t
|
||||
pull(void *dest, size_t num_bytes, size_t required_bytes) override;
|
||||
// The CPU-managed stream engine makes all beats available to the bridge,
|
||||
// hence the NOP.
|
||||
virtual void flush() override{};
|
||||
virtual void init() override{};
|
||||
|
||||
private:
|
||||
std::function<size_t(size_t, char *, size_t)> axi4_read;
|
||||
|
@ -85,17 +96,24 @@ private:
|
|||
* FPGA out of a user-provided buffer. IO over a CPU-managed AXI4 IF is
|
||||
* implemented with axi4_write, and is provided by the host-platform.
|
||||
*/
|
||||
class StreamFromCPU : public CPUManagedStream {
|
||||
class CPUToFPGADriver final : public CPUManagedDriver,
|
||||
public CPUToFPGAStreamDriver {
|
||||
public:
|
||||
StreamFromCPU(CPUManagedStreamParameters params,
|
||||
std::function<uint32_t(size_t)> mmio_read,
|
||||
std::function<size_t(size_t, char *, size_t)> axi4_write)
|
||||
: CPUManagedStream(params, mmio_read), axi4_write(axi4_write){};
|
||||
CPUToFPGADriver(StreamParameters params,
|
||||
std::function<uint32_t(size_t)> mmio_read,
|
||||
std::function<size_t(size_t, char *, size_t)> axi4_write)
|
||||
: CPUManagedDriver(params, mmio_read), axi4_write(axi4_write){};
|
||||
|
||||
size_t push(void *src, size_t num_bytes, size_t required_bytes);
|
||||
virtual size_t
|
||||
push(void *src, size_t num_bytes, size_t required_bytes) override;
|
||||
// On a push all beats are delivered to the FPGA, so a NOP is sufficient here.
|
||||
virtual void flush() override{};
|
||||
virtual void init() override{};
|
||||
|
||||
private:
|
||||
std::function<size_t(size_t, char *, size_t)> axi4_write;
|
||||
};
|
||||
|
||||
#endif // __CPU_MANAGED_STREAM_H
|
||||
} // namespace CPUManagedStreams
|
||||
|
||||
#endif // __BRIDGES_CPU_MANAGED_STREAM_H
|
||||
|
|
|
@ -0,0 +1,57 @@
|
|||
#include "fpga_managed_stream.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
|
||||
// Publishes buffer_base_fpga to the stream engine as two 32-bit MMIO writes:
// the upper word to toHostPhysAddrHighAddr first, then the lower word to
// toHostPhysAddrLowAddr.
void FPGAManagedStreams::FPGAToCPUDriver::init() {
  const auto base_hi = static_cast<uint32_t>(buffer_base_fpga >> 32);
  const auto base_lo = static_cast<uint32_t>(buffer_base_fpga);

  mmio_write(params.toHostPhysAddrHighAddr, base_hi);
  mmio_write(params.toHostPhysAddrLowAddr, base_lo);
}
|
||||
/**
|
||||
* @brief Dequeues as much as num_bytes of data from the associated bridge
|
||||
* stream.
|
||||
*
|
||||
* @param dest Buffer into which to copy dequeued stream data
|
||||
* @param num_bytes Bytes of data to dequeue
|
||||
* @param required_bytes Minimum number of bytes to dequeue. If fewer bytes
|
||||
* would be dequeued, dequeue none and return 0.
|
||||
* @return size_t Number of bytes successfully dequeued
|
||||
*/
|
||||
size_t FPGAManagedStreams::FPGAToCPUDriver::pull(void *dest,
|
||||
size_t num_bytes,
|
||||
size_t required_bytes) {
|
||||
assert(num_bytes >= required_bytes);
|
||||
size_t bytes_in_buffer = mmio_read(params.bytesAvailableAddr);
|
||||
if (bytes_in_buffer < required_bytes) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
void *src_addr = (char *)buffer_base + buffer_offset;
|
||||
size_t first_copy_bytes =
|
||||
((buffer_offset + bytes_in_buffer) > params.buffer_capacity)
|
||||
? params.buffer_capacity - buffer_offset
|
||||
: bytes_in_buffer;
|
||||
std::memcpy(dest, src_addr, first_copy_bytes);
|
||||
if (first_copy_bytes < bytes_in_buffer) {
|
||||
std::memcpy((char *)dest + first_copy_bytes,
|
||||
buffer_base,
|
||||
bytes_in_buffer - first_copy_bytes);
|
||||
}
|
||||
buffer_offset = (buffer_offset + bytes_in_buffer) % params.buffer_capacity;
|
||||
mmio_write(params.bytesConsumedAddr, bytes_in_buffer);
|
||||
return bytes_in_buffer;
|
||||
}
|
||||
|
||||
void FPGAManagedStreams::FPGAToCPUDriver::flush() {
|
||||
mmio_write(params.toHostStreamFlushAddr, 1);
|
||||
// TODO: Consider if this should be made non-blocking // alternate API
|
||||
auto flush_done = false;
|
||||
int attempts = 0;
|
||||
while (!flush_done) {
|
||||
flush_done = (mmio_read(params.toHostStreamFlushDoneAddr) & 1);
|
||||
if (++attempts > 256) {
|
||||
exit(1); // Bridge stream flush appears to deadlock
|
||||
};
|
||||
}
|
||||
}
|
|
@ -0,0 +1,88 @@
|
|||
#ifndef __BRIDGES_FPGA_MANAGED_STREAM_H
|
||||
#define __BRIDGES_FPGA_MANAGED_STREAM_H
|
||||
|
||||
// See LICENSE for license details.
|
||||
|
||||
#include <functional>
|
||||
#include <string>
|
||||
|
||||
#include "bridge_stream_driver.h"
|
||||
|
||||
namespace FPGAManagedStreams {
|
||||
/**
 * @brief Parameters emitted by Golden Gate for one FPGA-managed stream.
 *
 * Every member ending in `Addr` is an address handed to the driver's MMIO
 * read/write callbacks.
 *
 * This will be replaced by a protobuf-derived class, and re-used across both
 * Scala and C++.
 */
struct StreamParameters {
  std::string stream_name;
  // Size at which the driver's read offset into the stream buffer wraps.
  uint32_t buffer_capacity;

  // Destinations for the upper / lower 32 bits of the buffer base address.
  uint64_t toHostPhysAddrHighAddr;
  uint64_t toHostPhysAddrLowAddr;

  // Registers used to query available data and to report consumed data.
  uint64_t bytesAvailableAddr;
  uint64_t bytesConsumedAddr;

  uint64_t toHostStreamDoneInitAddr;

  // Registers used to request a flush and to observe its completion.
  uint64_t toHostStreamFlushAddr;
  uint64_t toHostStreamFlushDoneAddr;

  StreamParameters(std::string stream_name,
                   uint32_t buffer_capacity,
                   uint64_t toHostPhysAddrHighAddr,
                   uint64_t toHostPhysAddrLowAddr,
                   uint64_t bytesAvailableAddr,
                   uint64_t bytesConsumedAddr,
                   uint64_t toHostStreamDoneInitAddr,
                   uint64_t toHostStreamFlushAddr,
                   uint64_t toHostStreamFlushDoneAddr)
      : stream_name(stream_name),
        buffer_capacity(buffer_capacity),
        toHostPhysAddrHighAddr(toHostPhysAddrHighAddr),
        toHostPhysAddrLowAddr(toHostPhysAddrLowAddr),
        bytesAvailableAddr(bytesAvailableAddr),
        bytesConsumedAddr(bytesConsumedAddr),
        toHostStreamDoneInitAddr(toHostStreamDoneInitAddr),
        toHostStreamFlushAddr(toHostStreamFlushAddr),
        toHostStreamFlushDoneAddr(toHostStreamFlushDoneAddr) {}
};
|
||||
|
||||
/**
|
||||
* @brief Implements streams sunk by the driver (sourced by the FPGA)
|
||||
*
|
||||
* Extends FPGAManagedStream to provide a pull method, which moves data from the
|
||||
* FPGA into a user-provided buffer. IO over a FPGA-mastered AXI4 IF is
|
||||
* implemented with pcis_read, and is provided by the host-platform.
|
||||
*
|
||||
*/
|
||||
class FPGAToCPUDriver : public FPGAToCPUStreamDriver {
|
||||
public:
|
||||
FPGAToCPUDriver(StreamParameters params,
|
||||
void *buffer_base,
|
||||
uint64_t buffer_base_fpga,
|
||||
std::function<uint32_t(size_t)> mmio_read,
|
||||
std::function<void(size_t, uint32_t)> mmio_write)
|
||||
: params(params), buffer_base(buffer_base),
|
||||
buffer_base_fpga(buffer_base_fpga), mmio_read_func(mmio_read),
|
||||
mmio_write_func(mmio_write){};
|
||||
|
||||
virtual size_t
|
||||
pull(void *dest, size_t num_bytes, size_t required_bytes) override;
|
||||
virtual void flush() override;
|
||||
virtual void init() override;
|
||||
|
||||
size_t mmio_read(size_t addr) { return mmio_read_func(addr); };
|
||||
void mmio_write(size_t addr, uint32_t data) { mmio_write_func(addr, data); };
|
||||
|
||||
private:
|
||||
StreamParameters params;
|
||||
void *buffer_base;
|
||||
uint64_t buffer_base_fpga;
|
||||
std::function<uint32_t(size_t)> mmio_read_func;
|
||||
std::function<void(size_t, uint32_t)> mmio_write_func;
|
||||
|
||||
// A read pointer offset from the base, in bytes
|
||||
int buffer_offset = 0;
|
||||
};
|
||||
|
||||
} // namespace FPGAManagedStreams
|
||||
|
||||
#endif // __BRIDGES_FPGA_MANAGED_STREAM_H
|
|
@ -233,7 +233,7 @@ size_t synthesized_prints_t::process_tokens(size_t beats,
|
|||
// See FireSim issue #208
|
||||
// This needs to be page aligned, as a DMA request that spans a page is
|
||||
// fractured into a pair, and for reasons unknown, first beat of the second
|
||||
// request is lost. Once aligned, qequests larger than a page will be
|
||||
// request is lost. Once aligned, requests larger than a page will be
|
||||
// fractured into page-size (64-beat) requests and these seem to behave
|
||||
// correctly.
|
||||
alignas(4096) char buf[maximum_batch_bytes];
|
||||
|
@ -307,11 +307,14 @@ void synthesized_prints_t::flush() {
|
|||
// empty. It might be safer to put a bound on this though.
|
||||
while (process_tokens(batch_beats, 0) != 0)
|
||||
;
|
||||
pull_flush(stream_idx);
|
||||
process_tokens(batch_beats, 0);
|
||||
|
||||
// If multiple tokens are being packed into a single stream beat, force the
|
||||
// widget to write out any incomplete beat
|
||||
if (token_bytes < beat_bytes) {
|
||||
write(mmio_addrs.flushNarrowPacket, 1);
|
||||
pull_flush(stream_idx);
|
||||
|
||||
// On an FPGA reading from the stream will have enough latency that
|
||||
// process_tokens will return non-zero on the first attempt, introducing no
|
||||
|
|
|
@ -52,6 +52,7 @@ void simif_t::target_init() {
|
|||
if (!fastloadmem && !load_mem_path.empty()) {
|
||||
loadmem.load_mem_from_file(load_mem_path);
|
||||
}
|
||||
host_mmio_init();
|
||||
}
|
||||
|
||||
int simif_t::simulation_run() {
|
||||
|
|
|
@ -76,25 +76,22 @@ protected:
|
|||
*
|
||||
* Historically this god class wrapped all of the features presented by FireSim
|
||||
* / MIDAS-derived simulators. Critically, it declares an interface for
|
||||
interacting with
|
||||
* the host-FPGA, which consist of methods for implementing 32b MMIO (read,
|
||||
* write), and latency-insensitive bridge streams (push, pull). Concrete
|
||||
* subclasses of simif_t must be written for metasimulation and each supported
|
||||
* host plaform. See simif_f1_t for an example.
|
||||
|
||||
* interacting with the host-FPGA, which consist of methods for implementing
|
||||
* 32b MMIO (read, write), and latency-insensitive bridge streams (push, pull).
|
||||
* Concrete subclasses of simif_t must be written for metasimulation and each
|
||||
* supported host platform. See simif_f1_t for an example.
|
||||
* simif_t also provides a few core functions that are tied to bridges and
|
||||
widgets that
|
||||
* must be present in all simulators:
|
||||
* widgets that must be present in all simulators:
|
||||
*
|
||||
* - To track simulation time, it provides methods to interact with the
|
||||
* ClockBridge. This bridge is solely responsible for defining a schedule of
|
||||
* clock edges to simulate, and must be instantiated in all targets. See
|
||||
actual_tcycle() and hcycle().
|
||||
* Utilities to report performance are based off these measures of time.
|
||||
* actual_tcycle() and hcycle(). Utilities to report performance are based
|
||||
* off these measures of time.
|
||||
*
|
||||
* - To read and write into FPGA DRAM, the LoadMem widget provides a
|
||||
* low-bandwidth side channel via MMIO. See read_mem, write_mem,
|
||||
zero_out_dram.
|
||||
* zero_out_dram.
|
||||
*/
|
||||
class simif_t {
|
||||
public:
|
||||
|
@ -122,6 +119,14 @@ public:
|
|||
|
||||
/** Bridge / Widget MMIO methods */
|
||||
|
||||
/**
|
||||
* @brief Provides a hook to do mmio-related initialization _before_ bridges.
|
||||
*
|
||||
* This permits setting up core simulation widgets (like stream engines) in a
|
||||
* fashion that may vary across different specializations of simif_t.
|
||||
*/
|
||||
virtual void host_mmio_init() = 0;
|
||||
|
||||
/**
|
||||
* @brief 32b MMIO write, issued over the simulation control bus (AXI4-lite).
|
||||
*
|
||||
|
@ -180,6 +185,23 @@ public:
|
|||
void *src,
|
||||
size_t num_bytes,
|
||||
size_t required_bytes) = 0;
|
||||
/**
|
||||
* @brief Hint that a stream should bypass any underlying batching
|
||||
* optimizations.
|
||||
*
|
||||
* A user-directed hint that a stream should bypass any underlying batching
|
||||
* optimizations. This may permit a future pull to read data that may
|
||||
* otherwise remain queued in parts of the host.
|
||||
*
|
||||
* @param stream_no The index of the stream to flush
|
||||
*/
|
||||
virtual void pull_flush(unsigned int stream_no) = 0;
|
||||
/**
|
||||
* @brief Analogous to pull_flush but for CPU-to-FPGA streams
|
||||
*
|
||||
* @param stream_no The index of the stream to flush
|
||||
*/
|
||||
virtual void push_flush(unsigned int stream_no) = 0;
|
||||
|
||||
// End host-platform interface.
|
||||
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
#include "simif_emul.h"
|
||||
|
||||
#include "bridges/cpu_managed_stream.h"
|
||||
#include "bridges/fpga_managed_stream.h"
|
||||
|
||||
simif_emul_t::simif_emul_t(const std::vector<std::string> &args)
|
||||
: simif_t(args) {
|
||||
|
@ -40,6 +41,7 @@ simif_emul_t::simif_emul_t(const std::vector<std::string> &args)
|
|||
|
||||
using namespace std::placeholders;
|
||||
auto mmio_read_func = std::bind(&simif_emul_t::read, this, _1);
|
||||
auto mmio_write_func = std::bind(&simif_emul_t::write, this, _1, _2);
|
||||
|
||||
#ifdef CPUMANAGEDSTREAMENGINE_0_PRESENT
|
||||
auto cpu_managed_axi4_read_func =
|
||||
|
@ -48,31 +50,69 @@ simif_emul_t::simif_emul_t(const std::vector<std::string> &args)
|
|||
std::bind(&simif_emul_t::cpu_managed_axi4_write, this, _1, _2, _3);
|
||||
|
||||
for (size_t i = 0; i < CPUMANAGEDSTREAMENGINE_0_from_cpu_stream_count; i++) {
|
||||
auto params = CPUManagedStreamParameters(
|
||||
auto params = CPUManagedStreams::StreamParameters(
|
||||
std::string(CPUMANAGEDSTREAMENGINE_0_from_cpu_names[i]),
|
||||
CPUMANAGEDSTREAMENGINE_0_from_cpu_dma_addrs[i],
|
||||
CPUMANAGEDSTREAMENGINE_0_from_cpu_count_addrs[i],
|
||||
CPUMANAGEDSTREAMENGINE_0_from_cpu_buffer_sizes[i]);
|
||||
|
||||
from_host_streams.push_back(
|
||||
StreamFromCPU(params, mmio_read_func, cpu_managed_axi4_write_func));
|
||||
cpu_to_fpga_streams.push_back(
|
||||
std::make_unique<CPUManagedStreams::CPUToFPGADriver>(
|
||||
params, mmio_read_func, cpu_managed_axi4_write_func));
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < CPUMANAGEDSTREAMENGINE_0_to_cpu_stream_count; i++) {
|
||||
auto params = CPUManagedStreamParameters(
|
||||
auto params = CPUManagedStreams::StreamParameters(
|
||||
std::string(CPUMANAGEDSTREAMENGINE_0_to_cpu_names[i]),
|
||||
CPUMANAGEDSTREAMENGINE_0_to_cpu_dma_addrs[i],
|
||||
CPUMANAGEDSTREAMENGINE_0_to_cpu_count_addrs[i],
|
||||
CPUMANAGEDSTREAMENGINE_0_to_cpu_buffer_sizes[i]);
|
||||
|
||||
to_host_streams.push_back(
|
||||
StreamToCPU(params, mmio_read_func, cpu_managed_axi4_read_func));
|
||||
fpga_to_cpu_streams.push_back(
|
||||
std::make_unique<CPUManagedStreams::FPGAToCPUDriver>(
|
||||
params, mmio_read_func, cpu_managed_axi4_read_func));
|
||||
}
|
||||
#endif // CPUMANAGEDSTREAMENGINE_0_PRESENT
|
||||
#ifdef FPGAMANAGEDSTREAMENGINE_0_PRESENT
|
||||
auto fpga_address_memory_base = ((char *)cpu_mem->get_data());
|
||||
auto offset = 0;
|
||||
|
||||
for (size_t i = 0; i < FPGAMANAGEDSTREAMENGINE_0_to_cpu_stream_count; i++) {
|
||||
auto params = FPGAManagedStreams::StreamParameters(
|
||||
std::string(FPGAMANAGEDSTREAMENGINE_0_to_cpu_names[i]),
|
||||
FPGAMANAGEDSTREAMENGINE_0_to_cpu_fpgaBufferDepth[i],
|
||||
FPGAMANAGEDSTREAMENGINE_0_to_cpu_toHostPhysAddrHighAddrs[i],
|
||||
FPGAMANAGEDSTREAMENGINE_0_to_cpu_toHostPhysAddrLowAddrs[i],
|
||||
FPGAMANAGEDSTREAMENGINE_0_to_cpu_bytesAvailableAddrs[i],
|
||||
FPGAMANAGEDSTREAMENGINE_0_to_cpu_bytesConsumedAddrs[i],
|
||||
FPGAMANAGEDSTREAMENGINE_0_to_cpu_toHostStreamDoneInitAddrs[i],
|
||||
FPGAMANAGEDSTREAMENGINE_0_to_cpu_toHostStreamFlushAddrs[i],
|
||||
FPGAMANAGEDSTREAMENGINE_0_to_cpu_toHostStreamFlushDoneAddrs[i]);
|
||||
|
||||
fpga_to_cpu_streams.push_back(
|
||||
std::make_unique<FPGAManagedStreams::FPGAToCPUDriver>(
|
||||
params,
|
||||
(void *)(fpga_address_memory_base + offset),
|
||||
offset,
|
||||
mmio_read_func,
|
||||
mmio_write_func));
|
||||
offset += params.buffer_capacity;
|
||||
}
|
||||
|
||||
#endif // FPGAMANAGEDSTREAMENGINE_0_PRESENT
|
||||
}
|
||||
|
||||
// Nothing to release by hand: members clean up after themselves.
simif_emul_t::~simif_emul_t() = default;
|
||||
|
||||
void simif_emul_t::host_mmio_init() {
|
||||
for (auto &stream : this->fpga_to_cpu_streams) {
|
||||
stream->init();
|
||||
}
|
||||
for (auto &stream : this->cpu_to_fpga_streams) {
|
||||
stream->init();
|
||||
}
|
||||
};
|
||||
|
||||
int simif_emul_t::run() {
|
||||
if (fastloadmem && !load_mem_path.empty()) {
|
||||
fprintf(stdout, "[fast loadmem] %s\n", load_mem_path.c_str());
|
||||
|
@ -119,8 +159,8 @@ size_t simif_emul_t::pull(unsigned stream_idx,
|
|||
void *dest,
|
||||
size_t num_bytes,
|
||||
size_t threshold_bytes) {
|
||||
assert(stream_idx < to_host_streams.size());
|
||||
return this->to_host_streams[stream_idx].pull(
|
||||
assert(stream_idx < fpga_to_cpu_streams.size());
|
||||
return this->fpga_to_cpu_streams[stream_idx]->pull(
|
||||
dest, num_bytes, threshold_bytes);
|
||||
}
|
||||
|
||||
|
@ -128,11 +168,21 @@ size_t simif_emul_t::push(unsigned stream_idx,
|
|||
void *src,
|
||||
size_t num_bytes,
|
||||
size_t threshold_bytes) {
|
||||
assert(stream_idx < from_host_streams.size());
|
||||
return this->from_host_streams[stream_idx].push(
|
||||
assert(stream_idx < cpu_to_fpga_streams.size());
|
||||
return this->cpu_to_fpga_streams[stream_idx]->push(
|
||||
src, num_bytes, threshold_bytes);
|
||||
}
|
||||
|
||||
// Flushes the FPGA-to-CPU stream at index stream_idx. The index is checked
// with an assert only.
void simif_emul_t::pull_flush(unsigned stream_idx) {
  assert(stream_idx < fpga_to_cpu_streams.size());

  auto &driver = *fpga_to_cpu_streams[stream_idx];
  driver.flush();
}
|
||||
|
||||
// Flushes the CPU-to-FPGA stream at index stream_idx. The index is checked
// with an assert only.
void simif_emul_t::push_flush(unsigned stream_idx) {
  assert(stream_idx < cpu_to_fpga_streams.size());

  auto &driver = *cpu_to_fpga_streams[stream_idx];
  driver.flush();
}
|
||||
|
||||
size_t
|
||||
simif_emul_t::cpu_managed_axi4_read(size_t addr, char *data, size_t size) {
|
||||
ssize_t len = (size - 1) / CPU_MANAGED_AXI4_BEAT_BYTES;
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
#ifndef __SIMIF_EMUL_H
|
||||
#define __SIMIF_EMUL_H
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "bridges/cpu_managed_stream.h"
|
||||
|
@ -21,6 +22,8 @@ public:
|
|||
|
||||
virtual void sim_init() = 0;
|
||||
|
||||
void host_mmio_init() override;
|
||||
|
||||
void write(size_t addr, uint32_t data) override;
|
||||
uint32_t read(size_t addr) override;
|
||||
|
||||
|
@ -32,6 +35,10 @@ public:
|
|||
void *src,
|
||||
size_t num_bytes,
|
||||
size_t threshold_bytes) override;
|
||||
|
||||
void pull_flush(unsigned int stream_no) override;
|
||||
void push_flush(unsigned int stream_no) override;
|
||||
|
||||
/**
|
||||
* @brief Pointers to inter-context (i.e., between VCS/verilator and driver)
|
||||
* AXI4 transaction channels
|
||||
|
@ -89,8 +96,8 @@ protected:
|
|||
// Writes directly into the host DRAM models to initialize them.
|
||||
void load_mems(const char *fname);
|
||||
|
||||
std::vector<StreamToCPU> to_host_streams;
|
||||
std::vector<StreamFromCPU> from_host_streams;
|
||||
std::vector<std::unique_ptr<FPGAToCPUStreamDriver>> fpga_to_cpu_streams;
|
||||
std::vector<std::unique_ptr<CPUToFPGAStreamDriver>> cpu_to_fpga_streams;
|
||||
};
|
||||
|
||||
#endif // __SIMIF_EMUL_H
|
||||
|
|
|
@ -36,25 +36,25 @@ simif_f1_t::simif_f1_t(const std::vector<std::string> &args) : simif_t(args) {
|
|||
std::bind(&simif_f1_t::cpu_managed_axi4_write, this, _1, _2, _3);
|
||||
|
||||
for (int i = 0; i < CPUMANAGEDSTREAMENGINE_0_from_cpu_stream_count; i++) {
|
||||
auto params = CPUManagedStreamParameters(
|
||||
auto params = CPUManagedStreams::StreamParameters(
|
||||
std::string(CPUMANAGEDSTREAMENGINE_0_from_cpu_names[i]),
|
||||
CPUMANAGEDSTREAMENGINE_0_from_cpu_dma_addrs[i],
|
||||
CPUMANAGEDSTREAMENGINE_0_from_cpu_count_addrs[i],
|
||||
CPUMANAGEDSTREAMENGINE_0_from_cpu_buffer_sizes[i]);
|
||||
|
||||
from_host_streams.push_back(
|
||||
StreamFromCPU(params, mmio_read_func, cpu_managed_axi4_write_func));
|
||||
from_host_streams.push_back(CPUManagedStreams::CPUToFPGADriver(
|
||||
params, mmio_read_func, cpu_managed_axi4_write_func));
|
||||
}
|
||||
|
||||
for (int i = 0; i < CPUMANAGEDSTREAMENGINE_0_to_cpu_stream_count; i++) {
|
||||
auto params = CPUManagedStreamParameters(
|
||||
auto params = CPUManagedStreams::StreamParameters(
|
||||
std::string(CPUMANAGEDSTREAMENGINE_0_to_cpu_names[i]),
|
||||
CPUMANAGEDSTREAMENGINE_0_to_cpu_dma_addrs[i],
|
||||
CPUMANAGEDSTREAMENGINE_0_to_cpu_count_addrs[i],
|
||||
CPUMANAGEDSTREAMENGINE_0_to_cpu_buffer_sizes[i]);
|
||||
|
||||
to_host_streams.push_back(
|
||||
StreamToCPU(params, mmio_read_func, cpu_managed_axi4_read_func));
|
||||
to_host_streams.push_back(CPUManagedStreams::FPGAToCPUDriver(
|
||||
params, mmio_read_func, cpu_managed_axi4_read_func));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -14,8 +14,9 @@ public:
|
|||
simif_f1_t(const std::vector<std::string> &args);
|
||||
~simif_f1_t();
|
||||
|
||||
// Unused by F1 since initialization / deinitization is done in the
|
||||
// constructor
|
||||
// Unused since no F1-specific MMIO is required to setup the simulation.
|
||||
void host_mmio_init() override{};
|
||||
|
||||
int run() override { return simulation_run(); }
|
||||
|
||||
void write(size_t addr, uint32_t data) override;
|
||||
|
@ -28,6 +29,10 @@ public:
|
|||
void *src,
|
||||
size_t num_bytes,
|
||||
size_t threshold_bytes) override;
|
||||
|
||||
void pull_flush(unsigned int stream_no) override {}
|
||||
void push_flush(unsigned int stream_no) override {}
|
||||
|
||||
uint32_t is_write_ready();
|
||||
void check_rc(int rc, char *infostr);
|
||||
void fpga_shutdown();
|
||||
|
@ -37,8 +42,8 @@ private:
|
|||
char in_buf[CTRL_BEAT_BYTES];
|
||||
char out_buf[CTRL_BEAT_BYTES];
|
||||
|
||||
std::vector<StreamToCPU> to_host_streams;
|
||||
std::vector<StreamFromCPU> from_host_streams;
|
||||
std::vector<CPUManagedStreams::FPGAToCPUDriver> to_host_streams;
|
||||
std::vector<CPUManagedStreams::CPUToFPGADriver> from_host_streams;
|
||||
|
||||
size_t cpu_managed_axi4_write(size_t addr, char *data, size_t size);
|
||||
size_t cpu_managed_axi4_read(size_t addr, char *data, size_t size);
|
||||
|
|
|
@ -12,8 +12,10 @@ public:
|
|||
simif_vitis_t(const std::vector<std::string> &args);
|
||||
~simif_vitis_t() {}
|
||||
|
||||
// Unused by Vitis since initialization / deinitization is done in the
|
||||
// constructor
|
||||
// Will be used once FPGA-managed AXI4 is fully plumbed through the shim
|
||||
// to setup the FPGAManagedStream engine.
|
||||
void host_mmio_init() override{};
|
||||
|
||||
int run() override { return simulation_run(); }
|
||||
|
||||
void write(size_t addr, uint32_t data) override;
|
||||
|
|
|
@ -5,20 +5,280 @@ package midas.core
|
|||
import chisel3._
|
||||
import chisel3.util._
|
||||
import freechips.rocketchip.amba.axi4._
|
||||
import freechips.rocketchip.config.{Parameters, Field}
|
||||
import freechips.rocketchip.config.{Field, Parameters}
|
||||
import freechips.rocketchip.diplomacy._
|
||||
|
||||
import midas.widgets._
|
||||
import midas.widgets.CppGenerationUtils._
|
||||
|
||||
// Bundle with a beat count and a flush flag.
// numBeatsWidth is the bit width of the numBeats field.
class WriteMetadata(val numBeatsWidth: Int) extends Bundle {
|
||||
// Beat count carried by this entry, numBeatsWidth bits wide.
val numBeats = Output(UInt(numBeatsWidth.W))
|
||||
// NOTE(review): presumably marks an entry created on behalf of a stream
// flush -- confirm against where this field is driven.
val isFlush = Output(Bool())
|
||||
}
|
||||
|
||||
/**
|
||||
* This is a stub to foreshadow the other implementation
|
||||
*/
|
||||
class FPGAManagedStreamEngine(p: Parameters, val params: StreamEngineParameters) extends StreamEngine(p) {
|
||||
require(sinkParams.isEmpty, "FPGAManagedStreamEngine does not currently support FPGA-sunk streams.")
|
||||
|
||||
// Beats refers to 512b words moving over a stream
|
||||
val pageBytes = 4096
|
||||
val beatBytes = BridgeStreamConstants.streamWidthBits / 8
|
||||
val pageBeats = pageBytes / beatBytes
|
||||
|
||||
def maxFlightForStream(params: StreamSourceParameters): Int =
|
||||
(params.fpgaBufferDepth * beatBytes) / pageBytes
|
||||
|
||||
val cpuManagedAXI4NodeOpt = None
|
||||
val fpgaManagedAXI4NodeOpt = Some(midas.widgets.AXI4TieOff()(p))
|
||||
|
||||
val (fpgaManagedAXI4NodeOpt, toCPUNode) = if (hasStreams) {
|
||||
// The implicit val defined in StreamEngine is not accessible here; Make a
|
||||
// duplicate that can be referenced by diplomatic nodes
|
||||
implicit val pShadow = p
|
||||
val xbar = AXI4Xbar()
|
||||
val toCPUNode = AXI4MasterNode(
|
||||
sourceParams.map { p =>
|
||||
AXI4MasterPortParameters(Seq(AXI4MasterParameters(name = p.name, maxFlight = Some(maxFlightForStream(p)))))
|
||||
}
|
||||
)
|
||||
xbar :=* AXI4Buffer() :=* toCPUNode
|
||||
(Some(xbar), Some(toCPUNode))
|
||||
} else {
|
||||
(None, None)
|
||||
}
|
||||
|
||||
lazy val module = new WidgetImp(this) {
|
||||
val io = IO(new WidgetIO)
|
||||
|
||||
case class ToCPUStreamDriverParameters(
|
||||
name: String,
|
||||
fpgaBufferDepth: Int,
|
||||
toHostPhysAddrHighAddr: Int,
|
||||
toHostPhysAddrLowAddr: Int,
|
||||
bytesAvailableAddr: Int,
|
||||
bytesConsumedAddr: Int,
|
||||
toHostStreamDoneInitAddr: Int,
|
||||
toHostStreamFlushAddr: Int,
|
||||
toHostStreamFlushDoneAddr: Int,
|
||||
)
|
||||
|
||||
// Invoke this in the module implementation
|
||||
/** Elaborates the hardware for one FPGA-to-CPU stream and registers the MMIO
  * registers its driver programs.
  *
  * Tokens accepted on `channel` are buffered in a local queue and written,
  * using the AW/W/B channels of `axi4`, into a circular buffer located at
  * `Cat(toHostPhysAddrHigh, toHostPhysAddrLow) + writePtr`. A three-state FSM
  * (`idle` -> `sendAddress` -> `sendData`) issues one burst at a time:
  *
  *  - Write credits start at the full buffer size, are spent when a write
  *    address is accepted, and are returned when the driver reports consumed
  *    bytes through `bytesConsumedByCPU`.
  *  - Read credits grow when a write response arrives and shrink by the bytes
  *    the driver reports as consumed.
  *  - Outside of a flush, a burst is only started when it reaches the next
  *    page boundary. Once `flushBeatsToIssue` is non-zero, shorter bursts are
  *    allowed so that buffered beats are drained.
  *
  * Statement order matters here: later connections to the same register take
  * precedence over earlier ones.
  *
  * @param channel  source of stream beats; connected to the local queue's enqueue port
  * @param axi4     AXI4 port used for the writes; its AR/R channels are tied off
  * @param chParams supplies the stream name (used as prefix for the register
  *                 names), `fpgaBufferDepth`, and the argument to `maxFlightForStream`
  * @return the register handles returned by `attach`, bundled with the stream
  *         name and the CPU buffer size in bytes
  */
def elaborateToHostCPUStream(
  channel:  DecoupledIO[UInt],
  axi4:     AXI4Bundle,
  chParams: StreamSourceParameters,
): ToCPUStreamDriverParameters = {

  require(
    BridgeStreamConstants.streamWidthBits == axi4.params.dataBits,
    s"FPGAManagedStreamEngine requires stream widths to match FPGA-managed AXI4 data width",
  )
  val cpuBufferDepthBeats = chParams.fpgaBufferDepth
  require(cpuBufferDepthBeats > pageBeats)
  // The depth is rounded up to the next power of two before converting to bytes.
  val cpuBufferSizeBytes = (1 << log2Ceil(cpuBufferDepthBeats)) * (BridgeStreamConstants.streamWidthBits / 8)
  // This is to simplify the hardware
  require(isPow2(cpuBufferSizeBytes))

  val toHostPhysAddrHigh = Reg(UInt(32.W))
  val toHostPhysAddrLow  = Reg(UInt(32.W))
  val bytesConsumedByCPU = RegInit(0.U(log2Ceil(cpuBufferSizeBytes + 1).W))

  // This sets up a double buffer that should give full throughput for a
  // single stream system. This queue could be grown under a multi-stream system.
  val outgoingQueue = Module(new BRAMQueue(2 * pageBeats)(UInt(BridgeStreamConstants.streamWidthBits.W)))
  outgoingQueue.io.enq <> channel

  val writeCredits = RegInit(cpuBufferSizeBytes.U(log2Ceil(cpuBufferSizeBytes + 1).W))
  val readCredits  = RegInit(0.U(log2Ceil(cpuBufferSizeBytes + 1).W))
  val writePtr     = RegInit(0.U(log2Ceil(cpuBufferSizeBytes).W))
  val doneInit     = RegInit(false.B)
  // Key assumption: write acknowledgements can be used as a synchronization
  // point, after which the CPU can read new data written into its circular
  // buffer. This tracks inflight requests, to increment read credits on
  // write acknowledgement, and to cap maxflight.
  val inflightBeatCounts = Module(
    new Queue(new WriteMetadata(log2Ceil(pageBeats + 1)), maxFlightForStream(chParams))
  )

  val idle :: sendAddress :: sendData :: Nil = Enum(3)
  val state             = RegInit(idle)
  val beatsToSendMinus1 = RegInit(0.U(log2Ceil(pageBeats).W))

  // Ensure we do not cross page boundaries per AXI4 spec.
  val beatsToPageBoundary =
    pageBeats.U - writePtr(log2Ceil(pageBytes) - 1, log2Ceil(beatBytes))
  assert((beatsToPageBoundary > 0.U) && (beatsToPageBoundary <= (pageBeats.U)))

  // Establish the largest AXI4 write request we can make, by doing a min
  // reduction over the following bounds:
  val writeBounds = Seq(
    outgoingQueue.io.count, // Beats available for enqueue in local FPGA buffer
    writeCredits >> log2Ceil(beatBytes).U, // Space available in cpu buffer
    beatsToPageBoundary, // Length to end of page
  )
  // NB: BeatsToPageBoundary covers the end of the circular buffer only because
  // we ensure the buffer size is a multiple of page size

  val writeableBeats       = writeBounds.reduce { (a, b) => Mux(a < b, a, b) }
  val writeableBeatsMinus1 = writeableBeats - 1.U

  // This register resets itself to 0 on cycles it is not set by the host
  // CPU. If it is non-zero it was written to in the last cycle, and so we
  // know we can update credits.
  assert(
    !doneInit || (!(RegNext(bytesConsumedByCPU) =/= 0.U) || (bytesConsumedByCPU === 0.U)),
    "Back-to-back MMIO accesses, or incorrect toggling on bytesConsumedByCPU",
  )
  when(bytesConsumedByCPU =/= 0.U) {
    bytesConsumedByCPU := 0.U
    writeCredits       := writeCredits + bytesConsumedByCPU
    readCredits        := readCredits - bytesConsumedByCPU
  }

  // doFlush is the request flag (attached as a register below); inFlush stays
  // set until every beat captured at the start of the flush is acknowledged.
  val doFlush, inFlush                   = RegInit(false.B)
  val flushBeatsToIssue, flushBeatsToAck = RegInit(0.U(log2Ceil(cpuBufferDepthBeats + 1).W))

  assert(readCredits >= bytesConsumedByCPU, "Driver read more bytes than available in circular buffer.")
  assert(
    (writeCredits + bytesConsumedByCPU) <= cpuBufferSizeBytes.U,
    "Driver granted more write credit than physically allowable.",
  )

  switch(state) {
    is(idle) {
      // The request flag is cleared whenever the FSM is idle; a new flush
      // snapshots the current queue occupancy.
      doFlush := false.B
      when(doFlush && !inFlush && (outgoingQueue.io.count > 0.U)) {
        inFlush           := true.B
        flushBeatsToIssue := outgoingQueue.io.count
        flushBeatsToAck   := outgoingQueue.io.count
      }
      // Start a burst when there is room to track it and either flush beats
      // are pending or the burst would run up to the page boundary.
      val start =
        (inflightBeatCounts.io.enq.ready) &&
          ((flushBeatsToIssue =/= 0.U) || (writeableBeats === beatsToPageBoundary))

      when(start) { state := sendAddress }
    }
    is(sendAddress) {
      when(axi4.aw.fire) {
        state             := sendData
        beatsToSendMinus1 := writeableBeatsMinus1
        writePtr          := writePtr + (writeableBeats * beatBytes.U)
        // Takes precedence over the credit return above, hence the explicit
        // bytesConsumedByCPU term (zero when nothing was reported).
        writeCredits      := writeCredits + bytesConsumedByCPU - (writeableBeats * beatBytes.U)
        flushBeatsToIssue := Mux(flushBeatsToIssue < writeableBeats, 0.U, flushBeatsToIssue - writeableBeats)
      }
    }
    is(sendData) {
      when(axi4.w.fire) {
        state             := Mux(axi4.w.bits.last, idle, sendData)
        beatsToSendMinus1 := beatsToSendMinus1 - 1.U
      }
    }
  }

  axi4.aw.valid      := (state === sendAddress)
  axi4.aw.bits.id    := 0.U
  axi4.aw.bits.addr  := Cat(toHostPhysAddrHigh, toHostPhysAddrLow) + writePtr
  axi4.aw.bits.len   := writeableBeatsMinus1
  axi4.aw.bits.size  := (log2Ceil(beatBytes)).U
  // This is assumed but not exposed by the PCIM interface, and is the
  // default transaction type supported by XDMA-backed AXI4 IFs anyways
  axi4.aw.bits.burst := AXI4Parameters.BURST_INCR
  // This is to permit intermediate width adapters, etc, to pack narrower
  // transactions into larger ones, in the event we make this IF narrower than 512b
  axi4.aw.bits.cache := AXI4Parameters.CACHE_MODIFIABLE
  // Assume page-sized transfers for now
  // These fields are unused by F1 PCIM, but pick reasonable default values for future proofing
  axi4.aw.bits.prot  := 0.U // Unprivileged, secure, data access
  axi4.aw.bits.qos   := 0.U // Default; unused
  axi4.aw.bits.lock  := 0.U // Normal, non-exclusive

  // Record each accepted burst so the matching write response can be credited.
  inflightBeatCounts.io.enq.valid         := axi4.aw.fire
  inflightBeatCounts.io.enq.bits.numBeats := writeableBeats
  inflightBeatCounts.io.enq.bits.isFlush  := flushBeatsToIssue =/= 0.U

  axi4.w.valid               := (state === sendData) && outgoingQueue.io.deq.valid
  axi4.w.bits.data           := outgoingQueue.io.deq.bits
  axi4.w.bits.strb           := ((BigInt(1) << beatBytes) - 1).U
  axi4.w.bits.last           := beatsToSendMinus1 === 0.U
  outgoingQueue.io.deq.ready := (state === sendData) && axi4.w.ready

  // Write Response handling
  axi4.b.ready := true.B

  val ackBeats = inflightBeatCounts.io.deq.bits.numBeats
  val ackFlush = inflightBeatCounts.io.deq.bits.isFlush
  when(axi4.b.fire) {
    // Takes precedence over the earlier readCredits update, so the consumed
    // bytes are subtracted here as well.
    readCredits := readCredits + (ackBeats * beatBytes.U) - bytesConsumedByCPU
    when(ackFlush) {
      val remainingBeatsToAck = Mux(ackBeats < flushBeatsToAck, flushBeatsToAck - ackBeats, 0.U)
      flushBeatsToAck := remainingBeatsToAck
      inFlush         := remainingBeatsToAck =/= 0.U
    }
  }
  inflightBeatCounts.io.deq.ready := axi4.b.fire
  assert(!axi4.b.valid || inflightBeatCounts.io.deq.valid)

  // We only use the write channels to implement FPGA-to-CPU streams
  axi4.ar.valid := false.B
  axi4.r.ready  := false.B

  // Register Driver-programmable MMIO registers
  ToCPUStreamDriverParameters(
    chParams.name,
    cpuBufferSizeBytes,
    attach(toHostPhysAddrHigh, s"${chParams.name}_toHostPhysAddrHigh"),
    attach(toHostPhysAddrLow, s"${chParams.name}_toHostPhysAddrLow"),
    attach(readCredits, s"${chParams.name}_bytesAvailable", ReadOnly),
    attach(bytesConsumedByCPU, s"${chParams.name}_bytesConsumed"),
    attach(doneInit, s"${chParams.name}_toHostStreamDoneInit"),
    attach(doFlush, s"${chParams.name}_toHostStreamFlush"),
    attach(!(doFlush || inFlush), s"${chParams.name}_toHostStreamFlushDone", ReadOnly),
  )
}
|
||||
|
||||
// Driver parameters for every FPGA-to-CPU stream, in stream order. When
// streams exist, each AXI4 bundle from toCPUNode is paired with its stream
// interface and its parameters, and the stream hardware is elaborated under
// a name prefix equal to the stream name. Without streams the list is empty.
val sourceDriverParameters =
  if (!hasStreams) {
    Seq()
  } else {
    val axi4Bundles = toCPUNode.get.out.map(_._1)
    axi4Bundles
      .zip(streamsToHostCPU)
      .zip(sourceParams)
      .map { case ((axi4IF, streamIF), params) =>
        chisel3.experimental.prefix(params.name) {
          elaborateToHostCPUStream(streamIF, axi4IF, params)
        }
      }
      .toSeq
  }
|
||||
|
||||
genCRFile()
|
||||
|
||||
/** Appends this widget's header constants to `sb`.
  *
  * After the inherited header content, this emits the number of FPGA-to-CPU
  * streams followed by one array per field of [[ToCPUStreamDriverParameters]].
  * Register fields are offset by `base` before being emitted.
  *
  * @param base base address added to every register address emitted here
  * @param sb   builder the generated header text is appended to
  */
override def genHeader(base: BigInt, sb: StringBuilder): Unit = {
  val headerWidgetName = getWName.toUpperCase
  super.genHeader(base, sb)

  // Emits "<WIDGET>_<prefix>_stream_count" and then one
  // "<WIDGET>_<prefix>_<field>" array per parameter field.
  def serializeStreamParameters(prefix: String, params: Seq[ToCPUStreamDriverParameters]): Unit = {
    val numStreams = params.size
    sb.append(genConstStatic(s"${headerWidgetName}_${prefix}_stream_count", UInt32(numStreams)))

    // Hack: avoid emitting a zero-sized array by providing a dummy set of
    // parameters when no streams are generated. This is a limitation of the
    // current C emission strategy. Note, the actual number of streams is still reported above.
    val placeholder    = ToCPUStreamDriverParameters("UNUSED", 0, 0, 0, 0, 0, 0, 0, 0)
    val nonEmptyParams = if (numStreams == 0) Seq(placeholder) else params

    val arraysToEmit = Seq(
      "names"                      -> nonEmptyParams.map { p => CStrLit(p.name) },
      "fpgaBufferDepth"            -> nonEmptyParams.map { p => UInt32(p.fpgaBufferDepth) },
      "toHostPhysAddrHighAddrs"    -> nonEmptyParams.map { p => UInt64(base + p.toHostPhysAddrHighAddr) },
      "toHostPhysAddrLowAddrs"     -> nonEmptyParams.map { p => UInt64(base + p.toHostPhysAddrLowAddr) },
      "bytesAvailableAddrs"        -> nonEmptyParams.map { p => UInt64(base + p.bytesAvailableAddr) },
      "bytesConsumedAddrs"         -> nonEmptyParams.map { p => UInt64(base + p.bytesConsumedAddr) },
      "toHostStreamDoneInitAddrs"  -> nonEmptyParams.map { p => UInt64(base + p.toHostStreamDoneInitAddr) },
      "toHostStreamFlushAddrs"     -> nonEmptyParams.map { p => UInt64(base + p.toHostStreamFlushAddr) },
      "toHostStreamFlushDoneAddrs" -> nonEmptyParams.map { p => UInt64(base + p.toHostStreamFlushDoneAddr) },
    )

    // Arrays are appended in the order listed above.
    arraysToEmit.foreach { case (name, values) =>
      sb.append(genArray(s"${headerWidgetName}_${prefix}_${name}", values))
    }
  }

  serializeStreamParameters("to_cpu", sourceDriverParameters)
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -317,8 +317,11 @@ class FPGATop(implicit p: Parameters) extends LazyModule with HasWidgets {
|
|||
beatBytes = params.dataBits / 8)
|
||||
))
|
||||
|
||||
streamingEngine.fpgaManagedAXI4NodeOpt.foreach {
|
||||
node := AXI4IdIndexer(params.idBits) := AXI4Buffer() := _
|
||||
streamingEngine.fpgaManagedAXI4NodeOpt match {
|
||||
case Some(engineNode) =>
|
||||
node := AXI4IdIndexer(params.idBits) := AXI4Buffer() := engineNode
|
||||
case None =>
|
||||
node := AXI4TieOff()
|
||||
}
|
||||
(node, params)
|
||||
}
|
||||
|
|
|
@ -17,7 +17,7 @@ import midas.core.{
|
|||
/**
|
||||
* Bridge Streams serve as means to do bulk transport from BridgeDriver to
|
||||
* BridgeModule and vice versa. Abstractly, they can be thought of as a 512b
|
||||
* wide latency-insensitive channel (i.e., queue).
|
||||
* wide latency-insensitive channel (i.e., a queue with some unknown latency).
|
||||
*
|
||||
* The two mixins in this file implement the two directions of
|
||||
* producer-consumer relationships: [[StreamFromHostCPU]] add a stream in
|
||||
|
|
|
@ -14,6 +14,7 @@ class NoConfig extends Config(Parameters.empty)
|
|||
class BaseMidasExamplesConfig extends Config(
|
||||
new WithDefaultMemModel ++
|
||||
new WithWiringTransform ++
|
||||
new HostDebugFeatures ++
|
||||
new Config((site, here, up) => {
|
||||
case SynthAsserts => true
|
||||
case GenerateMultiCycleRamModels => true
|
||||
|
|
|
@ -6,22 +6,31 @@ import scala.util.matching.Regex
|
|||
import scala.io.Source
|
||||
import org.scalatest.Suites
|
||||
import org.scalatest.matchers.should._
|
||||
import freechips.rocketchip.config.Config
|
||||
|
||||
object BaseConfigs {
|
||||
def f1 = Seq(classOf[DefaultF1Config])
|
||||
def vitis = Seq(classOf[DefaultVitisConfig])
|
||||
}
|
||||
|
||||
abstract class TutorialSuite(
|
||||
val targetName: String, // See GeneratorUtils
|
||||
targetConfigs: String = "NoConfig",
|
||||
platformConfigs: String = "HostDebugFeatures_DefaultF1Config",
|
||||
platformConfigs: Seq[Class[_ <: Config]] = Seq(),
|
||||
tracelen: Int = 8,
|
||||
simulationArgs: Seq[String] = Seq()
|
||||
) extends firesim.TestSuiteCommon with Matchers {
|
||||
|
||||
lazy val basePlatformConfig = BaseConfigs.f1.asInstanceOf[Seq[Class[_ <: Config]]]
|
||||
val backendSimulator = "verilator"
|
||||
def platformConfigString = (platformConfigs ++ basePlatformConfig).map(_.getSimpleName).mkString("_")
|
||||
|
||||
val targetTuple = s"$targetName-$targetConfigs-$platformConfigs"
|
||||
|
||||
val targetTuple = s"$targetName-$targetConfigs-${platformConfigString}"
|
||||
val commonMakeArgs = Seq(s"TARGET_PROJECT=midasexamples",
|
||||
s"DESIGN=$targetName",
|
||||
s"TARGET_CONFIG=${targetConfigs}",
|
||||
s"PLATFORM_CONFIG=${platformConfigs}")
|
||||
s"PLATFORM_CONFIG=${platformConfigString}")
|
||||
|
||||
def run(backend: String,
|
||||
debug: Boolean = false,
|
||||
|
@ -181,14 +190,17 @@ abstract class TutorialSuite(
|
|||
|
||||
//class PointerChaserF1Test extends TutorialSuite(
|
||||
// "PointerChaser", "PointerChaserConfig", simulationArgs = Seq("`cat runtime.conf`"))
|
||||
|
||||
class GCDF1Test extends TutorialSuite("GCD")
|
||||
class GCDVitisTest extends GCDF1Test { override lazy val basePlatformConfig = BaseConfigs.vitis }
|
||||
|
||||
// Hijack Parity to test all of the Midas-level backends
|
||||
class ParityF1Test extends TutorialSuite("Parity") {
|
||||
runTest("verilator", true)
|
||||
runTest("vcs", true)
|
||||
}
|
||||
|
||||
class ParityVitisTest extends TutorialSuite("Parity", platformConfigs = classOf[DefaultVitisConfig].getSimpleName) {
|
||||
class ParityVitisTest extends TutorialSuite("Parity") {
|
||||
override lazy val basePlatformConfig = BaseConfigs.vitis
|
||||
runTest("verilator", true)
|
||||
runTest("vcs", true)
|
||||
}
|
||||
|
@ -254,7 +266,7 @@ class AutoCounterCoverModuleF1Test extends TutorialSuite("AutoCounterCoverModule
|
|||
}
|
||||
class AutoCounterPrintfF1Test extends TutorialSuite("AutoCounterPrintfModule",
|
||||
simulationArgs = Seq("+print-file=synthprinttest.out"),
|
||||
platformConfigs = "AutoCounterPrintf_HostDebugFeatures_DefaultF1Config") {
|
||||
platformConfigs = classOf[AutoCounterPrintf] +: BaseConfigs.f1) {
|
||||
diffSynthesizedLog("synthprinttest.out0", stdoutPrefix = "AUTOCOUNTER_PRINT CYCLE", synthPrefix = "CYCLE")
|
||||
}
|
||||
class AutoCounterGlobalResetConditionF1Test extends TutorialSuite("AutoCounterGlobalResetCondition",
|
||||
|
@ -282,8 +294,12 @@ class AutoCounterGlobalResetConditionF1Test extends TutorialSuite("AutoCounterGl
|
|||
|
||||
class PrintfModuleF1Test extends TutorialSuite("PrintfModule",
|
||||
simulationArgs = Seq("+print-no-cycle-prefix", "+print-file=synthprinttest.out")) {
|
||||
runTest("vcs", true)
|
||||
diffSynthesizedLog("synthprinttest.out0")
|
||||
}
|
||||
|
||||
class PrintfModuleVitisTest extends PrintfModuleF1Test { override lazy val basePlatformConfig = BaseConfigs.vitis }
|
||||
|
||||
class NarrowPrintfModuleF1Test extends TutorialSuite("NarrowPrintfModule",
|
||||
simulationArgs = Seq("+print-no-cycle-prefix", "+print-file=synthprinttest.out")) {
|
||||
diffSynthesizedLog("synthprinttest.out0")
|
||||
|
@ -353,6 +369,8 @@ class MulticlockPrintF1Test extends TutorialSuite("MulticlockPrintfModule",
|
|||
synthLinesToDrop = 4)
|
||||
}
|
||||
|
||||
class MulticlockPrintVitisTest extends MulticlockPrintF1Test { override lazy val basePlatformConfig = BaseConfigs.vitis }
|
||||
|
||||
class MulticlockAutoCounterF1Test extends TutorialSuite("MulticlockAutoCounterModule",
|
||||
simulationArgs = Seq("+autocounter-readrate=1000", "+autocounter-filename-base=autocounter")) {
|
||||
checkAutoCounterCSV("autocounter0.csv", "AUTOCOUNTER_PRINT ")
|
||||
|
@ -395,7 +413,7 @@ class PassthroughModelBridgeSourceTest extends TutorialSuite("PassthroughModelBr
|
|||
class ResetPulseBridgeActiveHighTest extends TutorialSuite(
|
||||
"ResetPulseBridgeTest",
|
||||
// Disable assertion synthesis to rely on native chisel assertions to catch bad behavior
|
||||
platformConfigs = "NoSynthAsserts_HostDebugFeatures_DefaultF1Config",
|
||||
platformConfigs = classOf[NoSynthAsserts] +: BaseConfigs.f1,
|
||||
simulationArgs = Seq(s"+reset-pulse-length0=${ResetPulseBridgeTestConsts.maxPulseLength}")) {
|
||||
runTest(backendSimulator,
|
||||
args = Seq(s"+reset-pulse-length0=${ResetPulseBridgeTestConsts.maxPulseLength + 1}"),
|
||||
|
@ -405,7 +423,7 @@ class ResetPulseBridgeActiveHighTest extends TutorialSuite(
|
|||
class ResetPulseBridgeActiveLowTest extends TutorialSuite(
|
||||
"ResetPulseBridgeTest",
|
||||
targetConfigs = "ResetPulseBridgeActiveLowConfig",
|
||||
platformConfigs = "NoSynthAsserts_HostDebugFeatures_DefaultF1Config",
|
||||
platformConfigs = classOf[NoSynthAsserts] +: BaseConfigs.f1,
|
||||
simulationArgs = Seq(s"+reset-pulse-length0=${ResetPulseBridgeTestConsts.maxPulseLength}")) {
|
||||
runTest(backendSimulator,
|
||||
args = Seq(s"+reset-pulse-length0=${ResetPulseBridgeTestConsts.maxPulseLength + 1}"),
|
||||
|
@ -434,8 +452,7 @@ class CustomConstraintsF1Test extends TutorialSuite("CustomConstraints") {
|
|||
atLeast (1, xdc) should fullyMatch regex "constrain_impl2 \\[reg WRAPPER_INST/CL/firesim_top/.*/dut/r1]".r
|
||||
}
|
||||
}
|
||||
|
||||
// Suite Collections
|
||||
// Midasexample Suite Collections
|
||||
class ChiselExampleDesigns extends Suites(
|
||||
new GCDF1Test,
|
||||
new ParityF1Test,
|
||||
|
@ -499,6 +516,13 @@ class FMRCITests extends Suites(
|
|||
new PassthroughModelBridgeSourceTest,
|
||||
)
|
||||
|
||||
class VitisCITests extends Suites (
|
||||
new GCDVitisTest,
|
||||
new ParityVitisTest,
|
||||
new PrintfModuleVitisTest,
|
||||
new MulticlockPrintVitisTest,
|
||||
)
|
||||
|
||||
// These groups are vestigial from CircleCI container limits
|
||||
class CIGroupA extends Suites(
|
||||
new ChiselExampleDesigns,
|
||||
|
@ -515,5 +539,6 @@ class CIGroupB extends Suites(
|
|||
new firesim.fasedtests.CIGroupB,
|
||||
new firesim.AllMidasUnitTests,
|
||||
new firesim.FailingUnitTests,
|
||||
new FMRCITests
|
||||
new FMRCITests,
|
||||
new VitisCITests
|
||||
)
|
||||
|
|
Loading…
Reference in New Issue