FPGA-managed bridge stream support in metasimulation (#1181)

* metasim-able FPGA-controlled bridge streams * simif: Add a virtual method to permit doing streamengine init * Remove unneeded vitis kernel def changes * Address some of nandors comments
2022-12-24 11:18:03 -05:00 · 2022-12-24 11:18:03 -05:00 · fdb5d6d439
parent d74c8d639d
commit fdb5d6d439
21 changed files with 651 additions and 82 deletions
--- a/sim/.scalafmt.conf
+++ b/sim/.scalafmt.conf
@ -29,7 +29,6 @@ project {
    "glob:**midas/src/main/scala/midas/SynthUnitTests.scala",
    "glob:**midas/src/main/scala/midas/core/CPUManagedStreamEngine.scala",
    "glob:**midas/src/main/scala/midas/core/Channel.scala",
-    "glob:**midas/src/main/scala/midas/core/FPGAManagedStreamEngine.scala",
    "glob:**midas/src/main/scala/midas/core/FPGATop.scala",
    "glob:**midas/src/main/scala/midas/core/Interfaces.scala",
    "glob:**midas/src/main/scala/midas/core/LIBDNUnitTest.scala",
--- a/sim/firesim-lib/src/main/cc/bridges/tracerv.cc
+++ b/sim/firesim-lib/src/main/cc/bridges/tracerv.cc
@ -292,6 +292,7 @@ void tracerv_t::tick() {

 // Pull in any remaining tokens and flush them to file
 void tracerv_t::flush() {
+  // Ask the host to flush this bridge's stream first, so that data which may
+  // still be queued behind it can be seen by the drain loop below.
+  pull_flush(stream_idx);
   while (this->trace_enabled && (process_tokens(this->stream_depth, 0) > 0))
     ;
 }
--- a/sim/midas/src/main/cc/bridges/bridge_driver.h
+++ b/sim/midas/src/main/cc/bridges/bridge_driver.h
@ -72,6 +72,7 @@ protected:
      return 0;
    return sim->push(stream_idx, data, size, minimum_batch_size);
  }
+  // Forwards a flush request for the FPGA-to-CPU stream with index stream_idx
+  // to the simulation interface (simif_t::pull_flush).
+  void pull_flush(unsigned stream_idx) { return sim->pull_flush(stream_idx); }

 private:
  simif_t *sim;
--- a/sim/midas/src/main/cc/bridges/bridge_stream_driver.h
+++ b/sim/midas/src/main/cc/bridges/bridge_stream_driver.h
@ -0,0 +1,22 @@
+// See LICENSE for license details.
+
+#ifndef __BRIDGES_BRIDGE_STREAM_DRIVER_H
+#define __BRIDGES_BRIDGE_STREAM_DRIVER_H
+
+/**
+ * @brief Abstract driver-side interface for a bridge stream that carries data
+ * from the FPGA to the CPU.
+ *
+ * Concrete implementations are provided per stream-engine flavour (see
+ * CPUManagedStreams::FPGAToCPUDriver and FPGAManagedStreams::FPGAToCPUDriver).
+ */
+class FPGAToCPUStreamDriver {
+public:
+  virtual ~FPGAToCPUStreamDriver(){};
+  // One-time setup hook for the stream; may be a NOP for an implementation
+  // that needs no setup.
+  virtual void init() = 0;
+  // Dequeues stream data into dest. num_bytes is the amount requested and
+  // required_bytes the minimum acceptable amount; implementations return the
+  // number of bytes dequeued, or 0 if fewer than required_bytes are available.
+  virtual size_t pull(void *dest, size_t num_bytes, size_t required_bytes) = 0;
+  // Requests that data still queued in the stream be made available to a
+  // subsequent pull; may be a NOP where nothing is ever held back.
+  virtual void flush() = 0;
+};
+
+/**
+ * @brief Abstract driver-side interface for a bridge stream that carries data
+ * from the CPU to the FPGA.
+ *
+ * Mirrors FPGAToCPUStreamDriver, with push in place of pull.
+ */
+class CPUToFPGAStreamDriver {
+public:
+  virtual ~CPUToFPGAStreamDriver(){};
+  // One-time setup hook for the stream; may be a NOP for an implementation
+  // that needs no setup.
+  virtual void init() = 0;
+  // Enqueues data from src into the stream. num_bytes is the amount offered
+  // and required_bytes the minimum acceptable amount; implementations return
+  // the number of bytes enqueued, or 0 if fewer than required_bytes fit.
+  virtual size_t push(void *src, size_t num_bytes, size_t required_bytes) = 0;
+  // Requests that previously pushed data be delivered; may be a NOP where a
+  // push already delivers everything.
+  virtual void flush() = 0;
+};
+
+#endif // __BRIDGES_BRIDGE_STREAM_DRIVER_H
--- a/sim/midas/src/main/cc/bridges/cpu_managed_stream.cc
+++ b/sim/midas/src/main/cc/bridges/cpu_managed_stream.cc
@ -12,7 +12,9 @@
 *        would be enqueued, this method enqueues none and returns 0.
 * @return size_t
 */
-size_t StreamFromCPU::push(void *src, size_t num_bytes, size_t required_bytes) {
+size_t CPUManagedStreams::CPUToFPGADriver::push(void *src,
+                                                size_t num_bytes,
+                                                size_t required_bytes) {
  assert(num_bytes >= required_bytes);

  // Similarly to above, the legacy implementation of DMA does not correctly
@ -51,7 +53,9 @@ size_t StreamFromCPU::push(void *src, size_t num_bytes, size_t required_bytes) {
 * would be dequeued, dequeue none and return 0.
 * @return size_t Number of bytes successfully dequeued
 */
-size_t StreamToCPU::pull(void *dest, size_t num_bytes, size_t required_bytes) {
+size_t CPUManagedStreams::FPGAToCPUDriver::pull(void *dest,
+                                                size_t num_bytes,
+                                                size_t required_bytes) {
  assert(num_bytes >= required_bytes);

  // The legacy code is clearly broken for requests that aren't a
--- a/sim/midas/src/main/cc/bridges/cpu_managed_stream.h
+++ b/sim/midas/src/main/cc/bridges/cpu_managed_stream.h
@ -1,29 +1,33 @@
 // See LICENSE for license details.

-#ifndef __CPU_MANAGED_STREAM_H
-#define __CPU_MANAGED_STREAM_H
+#ifndef __BRIDGES_CPU_MANAGED_STREAM_H
+#define __BRIDGES_CPU_MANAGED_STREAM_H

 #include <functional>
 #include <string>
+
+#include "bridge_stream_driver.h"
+
+namespace CPUManagedStreams {
 /**
 * @brief Parameters emitted for a CPU-managed stream emitted by Golden Gate.
 *
 * This will be replaced by a protobuf-derived class, and re-used across both
 * Scala and C++.
 */
-typedef struct CPUManagedStreamParameters {
+typedef struct StreamParameters {
  std::string stream_name;
  uint64_t dma_addr;
  uint64_t count_addr;
  uint32_t fpga_buffer_size;

-  CPUManagedStreamParameters(std::string stream_name,
-                             uint64_t dma_addr,
-                             uint64_t count_addr,
-                             int fpga_buffer_size)
+  StreamParameters(std::string stream_name,
+                   uint64_t dma_addr,
+                   uint64_t count_addr,
+                   int fpga_buffer_size)
      : stream_name(stream_name), dma_addr(dma_addr), count_addr(count_addr),
        fpga_buffer_size(fpga_buffer_size){};
-} CPUManagedStreamParameters;
+} StreamParameters;

 /**
 * @brief Base class for CPU-managed streams
@ -39,14 +43,15 @@ typedef struct CPUManagedStreamParameters {
 * FPGA-managed AXI4 for their platform.
 *
 */
-class CPUManagedStream {
+class CPUManagedDriver {
 public:
-  CPUManagedStream(CPUManagedStreamParameters params,
+  CPUManagedDriver(StreamParameters params,
                   std::function<uint32_t(size_t)> mmio_read_func)
      : params(params), mmio_read_func(mmio_read_func){};
+  virtual ~CPUManagedDriver(){};

 private:
-  CPUManagedStreamParameters params;
+  StreamParameters params;
  std::function<uint32_t(size_t)> mmio_read_func;

 public:
@ -65,14 +70,20 @@ public:
 * implemented with axi4_read, and is provided by the host-platform.
 *
 */
-class StreamToCPU : public CPUManagedStream {
+class FPGAToCPUDriver final : public CPUManagedDriver,
+                              public FPGAToCPUStreamDriver {
 public:
-  StreamToCPU(CPUManagedStreamParameters params,
-              std::function<uint32_t(size_t)> mmio_read,
-              std::function<size_t(size_t, char *, size_t)> axi4_read)
-      : CPUManagedStream(params, mmio_read), axi4_read(axi4_read){};
+  FPGAToCPUDriver(StreamParameters params,
+                  std::function<uint32_t(size_t)> mmio_read,
+                  std::function<size_t(size_t, char *, size_t)> axi4_read)
+      : CPUManagedDriver(params, mmio_read), axi4_read(axi4_read){};

-  size_t pull(void *dest, size_t num_bytes, size_t required_bytes);
+  virtual size_t
+  pull(void *dest, size_t num_bytes, size_t required_bytes) override;
+  // The CPU-managed stream engine makes all beats available to the bridge,
+  // hence the NOP.
+  virtual void flush() override{};
+  virtual void init() override{};

 private:
  std::function<size_t(size_t, char *, size_t)> axi4_read;
@ -85,17 +96,24 @@ private:
 * FPGA out of a user-provided buffer. IO over a CPU-managed AXI4 IF is
 * implemented with axi4_write, and is provided by the host-platform.
 */
-class StreamFromCPU : public CPUManagedStream {
+class CPUToFPGADriver final : public CPUManagedDriver,
+                              public CPUToFPGAStreamDriver {
 public:
-  StreamFromCPU(CPUManagedStreamParameters params,
-                std::function<uint32_t(size_t)> mmio_read,
-                std::function<size_t(size_t, char *, size_t)> axi4_write)
-      : CPUManagedStream(params, mmio_read), axi4_write(axi4_write){};
+  CPUToFPGADriver(StreamParameters params,
+                  std::function<uint32_t(size_t)> mmio_read,
+                  std::function<size_t(size_t, char *, size_t)> axi4_write)
+      : CPUManagedDriver(params, mmio_read), axi4_write(axi4_write){};

-  size_t push(void *src, size_t num_bytes, size_t required_bytes);
+  virtual size_t
+  push(void *src, size_t num_bytes, size_t required_bytes) override;
+  // On a push all beats are delivered to the FPGA, so a NOP is sufficient here.
+  virtual void flush() override{};
+  virtual void init() override{};

 private:
  std::function<size_t(size_t, char *, size_t)> axi4_write;
 };

-#endif // __CPU_MANAGED_STREAM_H
+} // namespace CPUManagedStreams
+
+#endif // __BRIDGES_CPU_MANAGED_STREAM_H
--- a/sim/midas/src/main/cc/bridges/fpga_managed_stream.cc
+++ b/sim/midas/src/main/cc/bridges/fpga_managed_stream.cc
@ -0,0 +1,57 @@
+#include "fpga_managed_stream.h"
+
+#include <assert.h>
+#include <cstring>
+#include <iostream>
+
+/**
+ * @brief Tells the stream engine where this stream's buffer lives.
+ *
+ * Writes the FPGA-side base address of the buffer (buffer_base_fpga) over
+ * MMIO as two 32-bit halves: the upper 32 bits first, then the lower 32 bits.
+ */
+void FPGAManagedStreams::FPGAToCPUDriver::init() {
+  mmio_write(params.toHostPhysAddrHighAddr, (uint32_t)(buffer_base_fpga >> 32));
+  mmio_write(params.toHostPhysAddrLowAddr, (uint32_t)buffer_base_fpga);
+}
+/**
+ * @brief Dequeues up to num_bytes of data from the associated bridge stream.
+ *
+ * The number of bytes currently available is read over MMIO. The data is then
+ * copied out of the circular buffer starting at the current read offset,
+ * wrapping back to the start of the buffer when the end is reached. Finally
+ * the read offset is advanced and the number of bytes consumed is reported
+ * back over MMIO.
+ *
+ * @param dest  Buffer into which to copy dequeued stream data. Must be able to
+ * hold num_bytes.
+ * @param num_bytes  Maximum number of bytes to dequeue
+ * @param required_bytes  Minimum number of bytes to dequeue. If fewer bytes
+ * would be dequeued, dequeue none and return 0.
+ * @return size_t Number of bytes successfully dequeued (never more than
+ * num_bytes)
+ */
+size_t FPGAManagedStreams::FPGAToCPUDriver::pull(void *dest,
+                                                 size_t num_bytes,
+                                                 size_t required_bytes) {
+  assert(num_bytes >= required_bytes);
+  size_t bytes_in_buffer = mmio_read(params.bytesAvailableAddr);
+  if (bytes_in_buffer < required_bytes) {
+    return 0;
+  }
+
+  // dest is only guaranteed to hold num_bytes, so never copy more than that
+  // even if the stream has more data queued. The remainder stays in the buffer
+  // for a later pull.
+  // NOTE(review): assumes the stream engine accepts a consumed count smaller
+  // than the available count -- confirm against the hardware.
+  size_t bytes_to_copy =
+      (bytes_in_buffer > num_bytes) ? num_bytes : bytes_in_buffer;
+
+  void *src_addr = (char *)buffer_base + buffer_offset;
+  // If the requested span runs past the end of the circular buffer, the first
+  // copy stops at the end and a second copy picks up from the base.
+  size_t first_copy_bytes =
+      ((buffer_offset + bytes_to_copy) > params.buffer_capacity)
+          ? params.buffer_capacity - buffer_offset
+          : bytes_to_copy;
+  std::memcpy(dest, src_addr, first_copy_bytes);
+  if (first_copy_bytes < bytes_to_copy) {
+    std::memcpy((char *)dest + first_copy_bytes,
+                buffer_base,
+                bytes_to_copy - first_copy_bytes);
+  }
+  buffer_offset = (buffer_offset + bytes_to_copy) % params.buffer_capacity;
+  mmio_write(params.bytesConsumedAddr, bytes_to_copy);
+  return bytes_to_copy;
+}
+
+/**
+ * @brief Requests a flush of the stream and blocks until it completes.
+ *
+ * Writes 1 to the flush register, then polls bit 0 of the flush-done
+ * register. If the flush has still not completed after the poll limit is
+ * exceeded, a diagnostic is printed and the process exits with status 1.
+ */
+void FPGAManagedStreams::FPGAToCPUDriver::flush() {
+  mmio_write(params.toHostStreamFlushAddr, 1);
+  // TODO: Consider if this should be made non-blocking // alternate API
+  auto flush_done = false;
+  int attempts = 0;
+  while (!flush_done) {
+    flush_done = (mmio_read(params.toHostStreamFlushDoneAddr) & 1);
+    // Only count unsuccessful polls against the limit, so a flush that
+    // completes on the final poll is not treated as a deadlock.
+    if (!flush_done && ++attempts > 256) {
+      // Bridge stream flush appears to deadlock
+      std::cerr << "Flush of FPGA-managed bridge stream '"
+                << params.stream_name << "' did not complete after "
+                << attempts << " polls; aborting.\n";
+      exit(1);
+    }
+  }
+}
--- a/sim/midas/src/main/cc/bridges/fpga_managed_stream.h
+++ b/sim/midas/src/main/cc/bridges/fpga_managed_stream.h
@ -0,0 +1,88 @@
+#ifndef __BRIDGES_FPGA_MANAGED_STREAM_H
+#define __BRIDGES_FPGA_MANAGED_STREAM_H
+
+// See LICENSE for license details.
+
+#include <functional>
+#include <string>
+
+#include "bridge_stream_driver.h"
+
+namespace FPGAManagedStreams {
+/**
+ * @brief Parameters emitted for a FPGA-managed stream emitted by Golden Gate.
+ *
+ * This will be replaced by a protobuf-derived class, and re-used across both
+ * Scala and C++.
+ */
+typedef struct StreamParameters {
+  // Name of the stream.
+  std::string stream_name;
+  // Size of the stream's circular buffer. FPGAToCPUDriver::pull treats this
+  // as a byte count when wrapping its read offset.
+  // NOTE(review): simif_emul.cc populates this from ..._fpgaBufferDepth;
+  // confirm that value is expressed in bytes rather than beats.
+  uint32_t buffer_capacity;
+  // MMIO addresses to which init() writes the upper / lower 32 bits of the
+  // FPGA-side buffer base address.
+  uint64_t toHostPhysAddrHighAddr;
+  uint64_t toHostPhysAddrLowAddr;
+  // MMIO address pull() reads to learn how many bytes are available.
+  uint64_t bytesAvailableAddr;
+  // MMIO address to which pull() writes the number of bytes it consumed.
+  uint64_t bytesConsumedAddr;
+  // Not referenced by the driver methods in fpga_managed_stream.cc.
+  uint64_t toHostStreamDoneInitAddr;
+  // MMIO address to which flush() writes 1 to start a flush.
+  uint64_t toHostStreamFlushAddr;
+  // MMIO address whose bit 0 flush() polls to detect completion.
+  uint64_t toHostStreamFlushDoneAddr;
+
+  // Member-wise constructor; arguments are in the same order as the fields.
+  StreamParameters(std::string stream_name,
+                   uint32_t buffer_capacity,
+                   uint64_t toHostPhysAddrHighAddr,
+                   uint64_t toHostPhysAddrLowAddr,
+                   uint64_t bytesAvailableAddr,
+                   uint64_t bytesConsumedAddr,
+                   uint64_t toHostStreamDoneInitAddr,
+                   uint64_t toHostStreamFlushAddr,
+                   uint64_t toHostStreamFlushDoneAddr)
+      : stream_name(stream_name), buffer_capacity(buffer_capacity),
+        toHostPhysAddrHighAddr(toHostPhysAddrHighAddr),
+        toHostPhysAddrLowAddr(toHostPhysAddrLowAddr),
+        bytesAvailableAddr(bytesAvailableAddr),
+        bytesConsumedAddr(bytesConsumedAddr),
+        toHostStreamDoneInitAddr(toHostStreamDoneInitAddr),
+        toHostStreamFlushAddr(toHostStreamFlushAddr),
+        toHostStreamFlushDoneAddr(toHostStreamFlushDoneAddr){};
+} StreamParameters;
+
+/**
+ * @brief Implements streams sunk by the driver (sourced by the FPGA)
+ *
+ * Implements FPGAToCPUStreamDriver. pull moves data into a user-provided
+ * buffer by copying it out of a circular buffer that the driver can address
+ * directly at buffer_base. The stream engine itself is controlled through the
+ * mmio_read / mmio_write callbacks supplied at construction.
+ *
+ */
+class FPGAToCPUDriver : public FPGAToCPUStreamDriver {
+public:
+  /**
+   * @param params  MMIO addresses and buffer capacity for this stream
+   * @param buffer_base  Driver-side pointer to the start of the stream buffer
+   * @param buffer_base_fpga  Address of the same buffer as programmed into the
+   * stream engine by init()
+   * @param mmio_read  Callback performing a 32b MMIO read
+   * @param mmio_write  Callback performing a 32b MMIO write
+   */
+  FPGAToCPUDriver(StreamParameters params,
+                  void *buffer_base,
+                  uint64_t buffer_base_fpga,
+                  std::function<uint32_t(size_t)> mmio_read,
+                  std::function<void(size_t, uint32_t)> mmio_write)
+      : params(params), buffer_base(buffer_base),
+        buffer_base_fpga(buffer_base_fpga), mmio_read_func(mmio_read),
+        mmio_write_func(mmio_write){};
+
+  virtual size_t
+  pull(void *dest, size_t num_bytes, size_t required_bytes) override;
+  virtual void flush() override;
+  virtual void init() override;
+
+  // Thin wrappers around the MMIO callbacks supplied at construction.
+  size_t mmio_read(size_t addr) { return mmio_read_func(addr); };
+  void mmio_write(size_t addr, uint32_t data) { mmio_write_func(addr, data); };
+
+private:
+  StreamParameters params;
+  // Driver-side pointer to the start of the circular buffer read by pull()
+  void *buffer_base;
+  // Buffer base address written to the stream engine by init()
+  uint64_t buffer_base_fpga;
+  std::function<uint32_t(size_t)> mmio_read_func;
+  std::function<void(size_t, uint32_t)> mmio_write_func;
+
+  // A read pointer offset from the base, in bytes
+  int buffer_offset = 0;
+};
+
+} // namespace FPGAManagedStreams
+
+#endif // __BRIDGES_FPGA_MANAGED_STREAM_H
--- a/sim/midas/src/main/cc/bridges/synthesized_prints.cc
+++ b/sim/midas/src/main/cc/bridges/synthesized_prints.cc
@ -233,7 +233,7 @@ size_t synthesized_prints_t::process_tokens(size_t beats,
  // See FireSim issue #208
  // This needs to be page aligned, as a DMA request that spans a page is
  // fractured into a pair, and for reasons unknown, first beat of the second
-  // request is lost. Once aligned, qequests larger than a page will be
+  // request is lost. Once aligned, requests larger than a page will be
  // fractured into page-size (64-beat) requests and these seem to behave
  // correctly.
  alignas(4096) char buf[maximum_batch_bytes];
@ -307,11 +307,14 @@ void synthesized_prints_t::flush() {
  // empty. It might be safer to put a bound on this though.
  while (process_tokens(batch_beats, 0) != 0)
    ;
+  pull_flush(stream_idx);
+  process_tokens(batch_beats, 0);

  // If multiple tokens are being packed into a single stream beat, force the
  // widget to write out any incomplete beat
  if (token_bytes < beat_bytes) {
    write(mmio_addrs.flushNarrowPacket, 1);
+    pull_flush(stream_idx);

    // On an FPGA reading from the stream will have enough latency that
    // process_tokens will return non-zero on the first attempt, introducing no
--- a/sim/midas/src/main/cc/simif.cc
+++ b/sim/midas/src/main/cc/simif.cc
@ -52,6 +52,7 @@ void simif_t::target_init() {
  if (!fastloadmem && !load_mem_path.empty()) {
    loadmem.load_mem_from_file(load_mem_path);
  }
+  host_mmio_init();
 }

 int simif_t::simulation_run() {
--- a/sim/midas/src/main/cc/simif.h
+++ b/sim/midas/src/main/cc/simif.h
@ -76,25 +76,22 @@ protected:
 *
 *  Historically this god class wrapped all of the features presented by FireSim
 *  / MIDAS-derived simulators. Critically, it declares an interface for
- interacting with
- *  the host-FPGA, which consist of methods for implementing 32b MMIO (read,
- *  write), and latency-insensitive bridge streams (push, pull). Concrete
- *  subclasses of simif_t must be written for metasimulation and each supported
- *  host plaform. See simif_f1_t for an example.
-
+ *  interacting with the host-FPGA, which consist of methods for implementing
+ *  32b MMIO (read, write), and latency-insensitive bridge streams (push, pull).
+ *  Concrete subclasses of simif_t must be written for metasimulation and each
+ *  supported host platform. See simif_f1_t for an example.
 *  simif_t also provides a few core functions that are tied to bridges and
- widgets that
- *  must be present in all simulators:
+ *  widgets that must be present in all simulators:
 *
 *  - To track simulation time, it provides methods to interact with the
 *    ClockBridge. This bridge is solely responsible for defining a schedule of
 *    clock edges to simulate, and must be instantiated in all targets. See
- actual_tcycle() and hcycle().
- *    Utilities to report performance are based off these measures of time.
+ *    actual_tcycle() and hcycle().  Utilities to report performance are based
+ *    off these measures of time.
 *
 *  - To read and write into FPGA DRAM, the LoadMem widget provides a
 *    low-bandwidth side channel via MMIO. See read_mem, write_mem,
- zero_out_dram.
+ *    zero_out_dram.
 */
 class simif_t {
 public:
@ -122,6 +119,14 @@ public:

  /** Bridge / Widget MMIO methods */

+  /**
+   * @brief Provides a hook to do mmio-related initialization _before_ bridges.
+   *
+   * This permits setting up core simulation widgets (like stream engines) in a
+   * fashion that may vary across different specializations of simif_t.
+   */
+  virtual void host_mmio_init() = 0;
+
  /**
   * @brief 32b MMIO write, issued over the simulation control bus (AXI4-lite).
   *
@ -180,6 +185,23 @@ public:
                      void *src,
                      size_t num_bytes,
                      size_t required_bytes) = 0;
+  /**
+   * @brief Hint that a stream should bypass any underlying batching
+   * optimizations.
+   *
+   * A user-directed hint that a stream should bypass any underlying batching
+   * optimizations. This may permit a future pull to read data that may
+   * otherwise remain queued in parts of the host.
+   *
+   * @param stream_no The index of the stream to flush
+   */
+  virtual void pull_flush(unsigned int stream_no) = 0;
+  /**
+   * @brief Analogous to pull_flush but for CPU-to-FPGA streams
+   *
+   * @param stream_no The index of the stream to flush
+   */
+  virtual void push_flush(unsigned int stream_no) = 0;

  // End host-platform interface.

--- a/sim/midas/src/main/cc/simif_emul.cc
+++ b/sim/midas/src/main/cc/simif_emul.cc
@ -3,6 +3,7 @@
 #include "simif_emul.h"

 #include "bridges/cpu_managed_stream.h"
+#include "bridges/fpga_managed_stream.h"

 simif_emul_t::simif_emul_t(const std::vector<std::string> &args)
    : simif_t(args) {
@ -40,6 +41,7 @@ simif_emul_t::simif_emul_t(const std::vector<std::string> &args)

  using namespace std::placeholders;
  auto mmio_read_func = std::bind(&simif_emul_t::read, this, _1);
+  auto mmio_write_func = std::bind(&simif_emul_t::write, this, _1, _2);

 #ifdef CPUMANAGEDSTREAMENGINE_0_PRESENT
  auto cpu_managed_axi4_read_func =
@ -48,31 +50,69 @@ simif_emul_t::simif_emul_t(const std::vector<std::string> &args)
      std::bind(&simif_emul_t::cpu_managed_axi4_write, this, _1, _2, _3);

  for (size_t i = 0; i < CPUMANAGEDSTREAMENGINE_0_from_cpu_stream_count; i++) {
-    auto params = CPUManagedStreamParameters(
+    auto params = CPUManagedStreams::StreamParameters(
        std::string(CPUMANAGEDSTREAMENGINE_0_from_cpu_names[i]),
        CPUMANAGEDSTREAMENGINE_0_from_cpu_dma_addrs[i],
        CPUMANAGEDSTREAMENGINE_0_from_cpu_count_addrs[i],
        CPUMANAGEDSTREAMENGINE_0_from_cpu_buffer_sizes[i]);

-    from_host_streams.push_back(
-        StreamFromCPU(params, mmio_read_func, cpu_managed_axi4_write_func));
+    cpu_to_fpga_streams.push_back(
+        std::make_unique<CPUManagedStreams::CPUToFPGADriver>(
+            params, mmio_read_func, cpu_managed_axi4_write_func));
  }

  for (size_t i = 0; i < CPUMANAGEDSTREAMENGINE_0_to_cpu_stream_count; i++) {
-    auto params = CPUManagedStreamParameters(
+    auto params = CPUManagedStreams::StreamParameters(
        std::string(CPUMANAGEDSTREAMENGINE_0_to_cpu_names[i]),
        CPUMANAGEDSTREAMENGINE_0_to_cpu_dma_addrs[i],
        CPUMANAGEDSTREAMENGINE_0_to_cpu_count_addrs[i],
        CPUMANAGEDSTREAMENGINE_0_to_cpu_buffer_sizes[i]);

-    to_host_streams.push_back(
-        StreamToCPU(params, mmio_read_func, cpu_managed_axi4_read_func));
+    fpga_to_cpu_streams.push_back(
+        std::make_unique<CPUManagedStreams::FPGAToCPUDriver>(
+            params, mmio_read_func, cpu_managed_axi4_read_func));
  }
 #endif // CPUMANAGEDSTREAMENGINE_0_PRESENT
+#ifdef FPGAMANAGEDSTREAMENGINE_0_PRESENT
+  auto fpga_address_memory_base = ((char *)cpu_mem->get_data());
+  auto offset = 0;
+
+  for (size_t i = 0; i < FPGAMANAGEDSTREAMENGINE_0_to_cpu_stream_count; i++) {
+    auto params = FPGAManagedStreams::StreamParameters(
+        std::string(FPGAMANAGEDSTREAMENGINE_0_to_cpu_names[i]),
+        FPGAMANAGEDSTREAMENGINE_0_to_cpu_fpgaBufferDepth[i],
+        FPGAMANAGEDSTREAMENGINE_0_to_cpu_toHostPhysAddrHighAddrs[i],
+        FPGAMANAGEDSTREAMENGINE_0_to_cpu_toHostPhysAddrLowAddrs[i],
+        FPGAMANAGEDSTREAMENGINE_0_to_cpu_bytesAvailableAddrs[i],
+        FPGAMANAGEDSTREAMENGINE_0_to_cpu_bytesConsumedAddrs[i],
+        FPGAMANAGEDSTREAMENGINE_0_to_cpu_toHostStreamDoneInitAddrs[i],
+        FPGAMANAGEDSTREAMENGINE_0_to_cpu_toHostStreamFlushAddrs[i],
+        FPGAMANAGEDSTREAMENGINE_0_to_cpu_toHostStreamFlushDoneAddrs[i]);
+
+    fpga_to_cpu_streams.push_back(
+        std::make_unique<FPGAManagedStreams::FPGAToCPUDriver>(
+            params,
+            (void *)(fpga_address_memory_base + offset),
+            offset,
+            mmio_read_func,
+            mmio_write_func));
+    offset += params.buffer_capacity;
+  }
+
+#endif // FPGAMANAGEDSTREAMENGINE_0_PRESENT
 }

 simif_emul_t::~simif_emul_t(){};

+/**
+ * @brief Runs the init() hook of every registered stream driver.
+ *
+ * FPGA-to-CPU streams are initialized first, then CPU-to-FPGA streams, each in
+ * the order they were added to their vector.
+ */
+void simif_emul_t::host_mmio_init() {
+  for (auto &stream : this->fpga_to_cpu_streams) {
+    stream->init();
+  }
+  for (auto &stream : this->cpu_to_fpga_streams) {
+    stream->init();
+  }
+};
+
 int simif_emul_t::run() {
  if (fastloadmem && !load_mem_path.empty()) {
    fprintf(stdout, "[fast loadmem] %s\n", load_mem_path.c_str());
@ -119,8 +159,8 @@ size_t simif_emul_t::pull(unsigned stream_idx,
                          void *dest,
                          size_t num_bytes,
                          size_t threshold_bytes) {
-  assert(stream_idx < to_host_streams.size());
-  return this->to_host_streams[stream_idx].pull(
+  assert(stream_idx < fpga_to_cpu_streams.size());
+  return this->fpga_to_cpu_streams[stream_idx]->pull(
      dest, num_bytes, threshold_bytes);
 }

@ -128,11 +168,21 @@ size_t simif_emul_t::push(unsigned stream_idx,
                          void *src,
                          size_t num_bytes,
                          size_t threshold_bytes) {
-  assert(stream_idx < from_host_streams.size());
-  return this->from_host_streams[stream_idx].push(
+  assert(stream_idx < cpu_to_fpga_streams.size());
+  return this->cpu_to_fpga_streams[stream_idx]->push(
      src, num_bytes, threshold_bytes);
 }

+/**
+ * @brief Flushes the FPGA-to-CPU stream with the given index.
+ *
+ * @param stream_idx Index into fpga_to_cpu_streams; asserted to be in range
+ */
+void simif_emul_t::pull_flush(unsigned stream_idx) {
+  assert(stream_idx < fpga_to_cpu_streams.size());
+  return this->fpga_to_cpu_streams[stream_idx]->flush();
+}
+
+/**
+ * @brief Flushes the CPU-to-FPGA stream with the given index.
+ *
+ * @param stream_idx Index into cpu_to_fpga_streams; asserted to be in range
+ */
+void simif_emul_t::push_flush(unsigned stream_idx) {
+  assert(stream_idx < cpu_to_fpga_streams.size());
+  return this->cpu_to_fpga_streams[stream_idx]->flush();
+}
+
 size_t
 simif_emul_t::cpu_managed_axi4_read(size_t addr, char *data, size_t size) {
  ssize_t len = (size - 1) / CPU_MANAGED_AXI4_BEAT_BYTES;
--- a/sim/midas/src/main/cc/simif_emul.h
+++ b/sim/midas/src/main/cc/simif_emul.h
@ -3,6 +3,7 @@
 #ifndef __SIMIF_EMUL_H
 #define __SIMIF_EMUL_H

+#include <memory>
 #include <vector>

 #include "bridges/cpu_managed_stream.h"
@ -21,6 +22,8 @@ public:

  virtual void sim_init() = 0;

+  void host_mmio_init() override;
+
  void write(size_t addr, uint32_t data) override;
  uint32_t read(size_t addr) override;

@ -32,6 +35,10 @@ public:
              void *src,
              size_t num_bytes,
              size_t threshold_bytes) override;
+
+  void pull_flush(unsigned int stream_no) override;
+  void push_flush(unsigned int stream_no) override;
+
  /**
   * @brief Pointers to inter-context (i.e., between VCS/verilator and driver)
   * AXI4 transaction channels
@ -89,8 +96,8 @@ protected:
  // Writes directly into the host DRAM models to initialize them.
  void load_mems(const char *fname);

-  std::vector<StreamToCPU> to_host_streams;
-  std::vector<StreamFromCPU> from_host_streams;
+  std::vector<std::unique_ptr<FPGAToCPUStreamDriver>> fpga_to_cpu_streams;
+  std::vector<std::unique_ptr<CPUToFPGAStreamDriver>> cpu_to_fpga_streams;
 };

 #endif // __SIMIF_EMUL_H
--- a/sim/midas/src/main/cc/simif_f1.cc
+++ b/sim/midas/src/main/cc/simif_f1.cc
@ -36,25 +36,25 @@ simif_f1_t::simif_f1_t(const std::vector<std::string> &args) : simif_t(args) {
      std::bind(&simif_f1_t::cpu_managed_axi4_write, this, _1, _2, _3);

  for (int i = 0; i < CPUMANAGEDSTREAMENGINE_0_from_cpu_stream_count; i++) {
-    auto params = CPUManagedStreamParameters(
+    auto params = CPUManagedStreams::StreamParameters(
        std::string(CPUMANAGEDSTREAMENGINE_0_from_cpu_names[i]),
        CPUMANAGEDSTREAMENGINE_0_from_cpu_dma_addrs[i],
        CPUMANAGEDSTREAMENGINE_0_from_cpu_count_addrs[i],
        CPUMANAGEDSTREAMENGINE_0_from_cpu_buffer_sizes[i]);

-    from_host_streams.push_back(
-        StreamFromCPU(params, mmio_read_func, cpu_managed_axi4_write_func));
+    from_host_streams.push_back(CPUManagedStreams::CPUToFPGADriver(
+        params, mmio_read_func, cpu_managed_axi4_write_func));
  }

  for (int i = 0; i < CPUMANAGEDSTREAMENGINE_0_to_cpu_stream_count; i++) {
-    auto params = CPUManagedStreamParameters(
+    auto params = CPUManagedStreams::StreamParameters(
        std::string(CPUMANAGEDSTREAMENGINE_0_to_cpu_names[i]),
        CPUMANAGEDSTREAMENGINE_0_to_cpu_dma_addrs[i],
        CPUMANAGEDSTREAMENGINE_0_to_cpu_count_addrs[i],
        CPUMANAGEDSTREAMENGINE_0_to_cpu_buffer_sizes[i]);

-    to_host_streams.push_back(
-        StreamToCPU(params, mmio_read_func, cpu_managed_axi4_read_func));
+    to_host_streams.push_back(CPUManagedStreams::FPGAToCPUDriver(
+        params, mmio_read_func, cpu_managed_axi4_read_func));
  }
 }

--- a/sim/midas/src/main/cc/simif_f1.h
+++ b/sim/midas/src/main/cc/simif_f1.h
@ -14,8 +14,9 @@ public:
  simif_f1_t(const std::vector<std::string> &args);
  ~simif_f1_t();

-  // Unused by F1 since initialization / deinitization is done in the
-  // constructor
+  // Unused since no F1-specific MMIO is required to setup the simulation.
+  void host_mmio_init() override{};
+
  int run() override { return simulation_run(); }

  void write(size_t addr, uint32_t data) override;
@ -28,6 +29,10 @@ public:
              void *src,
              size_t num_bytes,
              size_t threshold_bytes) override;
+
+  void pull_flush(unsigned int stream_no) override {}
+  void push_flush(unsigned int stream_no) override {}
+
  uint32_t is_write_ready();
  void check_rc(int rc, char *infostr);
  void fpga_shutdown();
@ -37,8 +42,8 @@ private:
  char in_buf[CTRL_BEAT_BYTES];
  char out_buf[CTRL_BEAT_BYTES];

-  std::vector<StreamToCPU> to_host_streams;
-  std::vector<StreamFromCPU> from_host_streams;
+  std::vector<CPUManagedStreams::FPGAToCPUDriver> to_host_streams;
+  std::vector<CPUManagedStreams::CPUToFPGADriver> from_host_streams;

  size_t cpu_managed_axi4_write(size_t addr, char *data, size_t size);
  size_t cpu_managed_axi4_read(size_t addr, char *data, size_t size);
--- a/sim/midas/src/main/cc/simif_vitis.h
+++ b/sim/midas/src/main/cc/simif_vitis.h
@ -12,8 +12,10 @@ public:
  simif_vitis_t(const std::vector<std::string> &args);
  ~simif_vitis_t() {}

-  // Unused by Vitis since initialization / deinitization is done in the
-  // constructor
+  // Will be used once FPGA-managed AXI4 is fully plumbed through the shim
+  // to setup the FPGAManagedStream engine.
+  void host_mmio_init() override{};
+
  int run() override { return simulation_run(); }

  void write(size_t addr, uint32_t data) override;
--- a/sim/midas/src/main/scala/midas/core/FPGAManagedStreamEngine.scala
+++ b/sim/midas/src/main/scala/midas/core/FPGAManagedStreamEngine.scala
@ -5,20 +5,280 @@ package midas.core
 import chisel3._
 import chisel3.util._
 import freechips.rocketchip.amba.axi4._
-import freechips.rocketchip.config.{Parameters, Field}
+import freechips.rocketchip.config.{Field, Parameters}
 import freechips.rocketchip.diplomacy._

 import midas.widgets._
+import midas.widgets.CppGenerationUtils._
+
+class WriteMetadata(val numBeatsWidth: Int) extends Bundle {
+  val numBeats = Output(UInt(numBeatsWidth.W))
+  val isFlush  = Output(Bool())
+}

-/**
-  * This is a stub to foreshadow the other implementation
-  */
 class FPGAManagedStreamEngine(p: Parameters, val params: StreamEngineParameters) extends StreamEngine(p) {
+  require(sinkParams.isEmpty, "FPGAManagedStreamEngine does not currently support FPGA-sunk streams.")
+
+  // Beats refers to 512b words moving over a stream
+  val pageBytes = 4096
+  val beatBytes = BridgeStreamConstants.streamWidthBits / 8
+  val pageBeats = pageBytes / beatBytes
+
+  def maxFlightForStream(params: StreamSourceParameters): Int =
+    (params.fpgaBufferDepth * beatBytes) / pageBytes
+
  val cpuManagedAXI4NodeOpt = None
-  val fpgaManagedAXI4NodeOpt = Some(midas.widgets.AXI4TieOff()(p))
+
+  val (fpgaManagedAXI4NodeOpt, toCPUNode) = if (hasStreams) {
+    // The implicit val defined in StreamEngine is not accessible here; Make a
+    // duplicate that can be referenced by diplomatic nodes
+    implicit val pShadow = p
+    val xbar             = AXI4Xbar()
+    val toCPUNode        = AXI4MasterNode(
+      sourceParams.map { p =>
+        AXI4MasterPortParameters(Seq(AXI4MasterParameters(name = p.name, maxFlight = Some(maxFlightForStream(p)))))
+      }
+    )
+    xbar :=* AXI4Buffer() :=* toCPUNode
+    (Some(xbar), Some(toCPUNode))
+  } else {
+    (None, None)
+  }

  lazy val module = new WidgetImp(this) {
    val io = IO(new WidgetIO)
+
+    case class ToCPUStreamDriverParameters(
+      name:                      String,
+      fpgaBufferDepth:           Int,
+      toHostPhysAddrHighAddr:    Int,
+      toHostPhysAddrLowAddr:     Int,
+      bytesAvailableAddr:        Int,
+      bytesConsumedAddr:         Int,
+      toHostStreamDoneInitAddr:  Int,
+      toHostStreamFlushAddr:     Int,
+      toHostStreamFlushDoneAddr: Int,
+    )
+
+    // Invoke this in the module implementation
+    def elaborateToHostCPUStream(
+      channel:  DecoupledIO[UInt],
+      axi4:     AXI4Bundle,
+      chParams: StreamSourceParameters,
+    ): ToCPUStreamDriverParameters = {
+
+      require(
+        BridgeStreamConstants.streamWidthBits == axi4.params.dataBits,
+        s"FPGAManagedStreamEngine requires stream widths to match FPGA-managed AXI4 data width",
+      )
+      val cpuBufferDepthBeats = chParams.fpgaBufferDepth
+      require(cpuBufferDepthBeats > pageBeats)
+      val cpuBufferSizeBytes  = (1 << log2Ceil(cpuBufferDepthBeats)) * (BridgeStreamConstants.streamWidthBits / 8)
+      // Requiring a power-of-two buffer size simplifies the hardware
+      require(isPow2(cpuBufferSizeBytes))
+
+      val toHostPhysAddrHigh = Reg(UInt(32.W))
+      val toHostPhysAddrLow  = Reg(UInt(32.W))
+      val bytesConsumedByCPU = RegInit(0.U(log2Ceil(cpuBufferSizeBytes + 1).W))
+
+      // This sets up a double buffer that should give full throughput for a
+      // single stream system. This queue could be grown under a multi-stream system.
+      val outgoingQueue = Module(new BRAMQueue(2 * pageBeats)(UInt(BridgeStreamConstants.streamWidthBits.W)))
+      outgoingQueue.io.enq <> channel
+
+      val writeCredits       = RegInit(cpuBufferSizeBytes.U(log2Ceil(cpuBufferSizeBytes + 1).W))
+      val readCredits        = RegInit(0.U(log2Ceil(cpuBufferSizeBytes + 1).W))
+      val writePtr           = RegInit(0.U(log2Ceil(cpuBufferSizeBytes).W))
+      val doneInit           = RegInit(false.B)
+      // Key assumption: write acknowledgements can be used as a synchronization
+      // point, after which the CPU can read new data written into its circular
+      // buffer. This tracks inflight requests, to increment read credits on
+      // write acknowledgement, and to cap maxflight.
+      val inflightBeatCounts = Module(
+        new Queue(new WriteMetadata(log2Ceil(pageBeats + 1)), maxFlightForStream(chParams))
+      )
+
+      val idle :: sendAddress :: sendData :: Nil = Enum(3)
+      val state                                  = RegInit(idle)
+      val beatsToSendMinus1                      = RegInit(0.U(log2Ceil(pageBeats).W))
+
+      // Ensure we do not cross page boundaries per AXI4 spec.
+      val beatsToPageBoundary =
+        pageBeats.U - writePtr(log2Ceil(pageBytes) - 1, log2Ceil(beatBytes))
+      assert((beatsToPageBoundary > 0.U) && (beatsToPageBoundary <= (pageBeats.U)))
+
+      // Establish the largest AXI4 write request we can make, by doing a min
+      // reduction over the following bounds:
+      val writeBounds = Seq(
+        outgoingQueue.io.count,                // Beats currently queued in the local FPGA buffer
+        writeCredits >> log2Ceil(beatBytes).U, // Space available in cpu buffer
+        beatsToPageBoundary,                   // Length to end of page
+      )
+      // NB: BeatsToPageBoundary covers the end of the circular buffer only because
+      // we ensure the buffer size is a multiple of page size
+
+      val writeableBeats       = writeBounds.reduce { (a, b) => Mux(a < b, a, b) }
+      val writeableBeatsMinus1 = writeableBeats - 1.U
+
+      // bytesConsumedByCPU resets itself to 0 on any cycle in which it is not
+      // written by the host CPU. If it is non-zero, it was written in the last
+      // cycle, so we know we can update credits.
+      assert(
+        !doneInit || (!(RegNext(bytesConsumedByCPU) =/= 0.U) || (bytesConsumedByCPU === 0.U)),
+        "Back-to-back MMIO accesses, or incorrect toggling on bytesConsumedByCPU",
+      )
+      when(bytesConsumedByCPU =/= 0.U) {
+        bytesConsumedByCPU := 0.U
+        writeCredits       := writeCredits + bytesConsumedByCPU
+        readCredits        := readCredits - bytesConsumedByCPU
+      }
+
+      val doFlush, inFlush                   = RegInit(false.B)
+      val flushBeatsToIssue, flushBeatsToAck = RegInit(0.U(log2Ceil(cpuBufferDepthBeats + 1).W))
+
+      assert(readCredits >= bytesConsumedByCPU, "Driver read more bytes than available in circular buffer.")
+      assert(
+        (writeCredits + bytesConsumedByCPU) <= cpuBufferSizeBytes.U,
+        "Driver granted more write credit than physically allowable.",
+      )
+
+      switch(state) {
+        is(idle) {
+          doFlush := false.B
+          when(doFlush && !inFlush && (outgoingQueue.io.count > 0.U)) {
+            inFlush           := true.B
+            flushBeatsToIssue := outgoingQueue.io.count
+            flushBeatsToAck   := outgoingQueue.io.count
+          }
+          val start =
+            (inflightBeatCounts.io.enq.ready) &&
+              ((flushBeatsToIssue =/= 0.U) || (writeableBeats === beatsToPageBoundary))
+
+          when(start) { state := sendAddress }
+        }
+        is(sendAddress) {
+          when(axi4.aw.fire) {
+            state             := sendData
+            beatsToSendMinus1 := writeableBeatsMinus1
+            writePtr          := writePtr + (writeableBeats * beatBytes.U)
+            writeCredits      := writeCredits + bytesConsumedByCPU - (writeableBeats * beatBytes.U)
+            flushBeatsToIssue := Mux(flushBeatsToIssue < writeableBeats, 0.U, flushBeatsToIssue - writeableBeats)
+          }
+        }
+        is(sendData) {
+          when(axi4.w.fire) {
+            state             := Mux(axi4.w.bits.last, idle, sendData)
+            beatsToSendMinus1 := beatsToSendMinus1 - 1.U
+          }
+        }
+      }
+
+      axi4.aw.valid      := (state === sendAddress)
+      axi4.aw.bits.id    := 0.U
+      axi4.aw.bits.addr  := Cat(toHostPhysAddrHigh, toHostPhysAddrLow) + writePtr
+      axi4.aw.bits.len   := writeableBeatsMinus1
+      axi4.aw.bits.size  := (log2Ceil(beatBytes)).U
+      // This is assumed but not exposed by the PCIM interface, and is the
+      // default transaction type supported by XDMA-backed AXI4 IFs anyways
+      axi4.aw.bits.burst := AXI4Parameters.BURST_INCR
+      // This permits intermediate width adapters, etc., to pack narrower
+      // transactions into larger ones, in the event we make this IF narrower than 512b
+      axi4.aw.bits.cache := AXI4Parameters.CACHE_MODIFIABLE
+      // Assume page-sized transfers for now
+      // These fields are unused by F1 PCIM, but pick reasonable default values for future proofing
+      axi4.aw.bits.prot  := 0.U // Unprivileged, secure, data access
+      axi4.aw.bits.qos   := 0.U // Default; unused
+      axi4.aw.bits.lock  := 0.U // Normal, non-exclusive
+
+      inflightBeatCounts.io.enq.valid         := axi4.aw.fire
+      inflightBeatCounts.io.enq.bits.numBeats := writeableBeats
+      inflightBeatCounts.io.enq.bits.isFlush  := flushBeatsToIssue =/= 0.U
+
+      axi4.w.valid               := (state === sendData) && outgoingQueue.io.deq.valid
+      axi4.w.bits.data           := outgoingQueue.io.deq.bits
+      axi4.w.bits.strb           := ((BigInt(1) << beatBytes) - 1).U
+      axi4.w.bits.last           := beatsToSendMinus1 === 0.U
+      outgoingQueue.io.deq.ready := (state === sendData) && axi4.w.ready
+
+      // Write Response handling
+      axi4.b.ready := true.B
+
+      val ackBeats = inflightBeatCounts.io.deq.bits.numBeats
+      val ackFlush = inflightBeatCounts.io.deq.bits.isFlush
+      when(axi4.b.fire) {
+        readCredits := readCredits + (ackBeats * beatBytes.U) - bytesConsumedByCPU
+        when(ackFlush) {
+          val remainingBeatsToAck = Mux(ackBeats < flushBeatsToAck, flushBeatsToAck - ackBeats, 0.U)
+          flushBeatsToAck := remainingBeatsToAck
+          inFlush         := remainingBeatsToAck =/= 0.U
+        }
+      }
+      inflightBeatCounts.io.deq.ready := axi4.b.fire
+      assert(!axi4.b.valid || inflightBeatCounts.io.deq.valid)
+
+      // We only use the write channels to implement FPGA-to-CPU streams
+      axi4.ar.valid := false.B
+      axi4.r.ready  := false.B
+
+      // Register Driver-programmable MMIO registers
+      ToCPUStreamDriverParameters(
+        chParams.name,
+        cpuBufferSizeBytes,
+        attach(toHostPhysAddrHigh, s"${chParams.name}_toHostPhysAddrHigh"),
+        attach(toHostPhysAddrLow, s"${chParams.name}_toHostPhysAddrLow"),
+        attach(readCredits, s"${chParams.name}_bytesAvailable", ReadOnly),
+        attach(bytesConsumedByCPU, s"${chParams.name}_bytesConsumed"),
+        attach(doneInit, s"${chParams.name}_toHostStreamDoneInit"),
+        attach(doFlush, s"${chParams.name}_toHostStreamFlush"),
+        attach(!(doFlush || inFlush), s"${chParams.name}_toHostStreamFlushDone", ReadOnly),
+      )
+    }
+
+    val sourceDriverParameters = if (hasStreams) {
+      val axi4Bundles = toCPUNode.get.out.map(_._1)
+      (for (((axi4IF, streamIF), params) <- axi4Bundles.zip(streamsToHostCPU).zip(sourceParams)) yield {
+        chisel3.experimental.prefix(params.name) {
+          elaborateToHostCPUStream(streamIF, axi4IF, params)
+        }
+      }).toSeq
+    } else {
+      Seq()
+    }
+
    genCRFile()
+
+    override def genHeader(base: BigInt, sb: StringBuilder) {
+      val headerWidgetName = getWName.toUpperCase
+      super.genHeader(base, sb)
+
+      def serializeStreamParameters(prefix: String, params: Seq[ToCPUStreamDriverParameters]): Unit = {
+        val numStreams = params.size
+        sb.append(genConstStatic(s"${headerWidgetName}_${prefix}_stream_count", UInt32(numStreams)))
+
+        // Hack: avoid emitting a zero-sized array by providing a dummy set of
+        // parameters when no streams are generated. This is a limitation of the
+        // current C emission strategy. Note, the actual number of streams is still reported above.
+        val placeholder    = ToCPUStreamDriverParameters("UNUSED", 0, 0, 0, 0, 0, 0, 0, 0)
+        val nonEmptyParams = if (numStreams == 0) Seq(placeholder) else params
+
+        val arraysToEmit = Seq(
+          "names"                      -> nonEmptyParams.map { p => CStrLit(p.name) },
+          "fpgaBufferDepth"            -> nonEmptyParams.map { p => UInt32(p.fpgaBufferDepth) },
+          "toHostPhysAddrHighAddrs"    -> nonEmptyParams.map { p => UInt64(base + p.toHostPhysAddrHighAddr) },
+          "toHostPhysAddrLowAddrs"     -> nonEmptyParams.map { p => UInt64(base + p.toHostPhysAddrLowAddr) },
+          "bytesAvailableAddrs"        -> nonEmptyParams.map { p => UInt64(base + p.bytesAvailableAddr) },
+          "bytesConsumedAddrs"         -> nonEmptyParams.map { p => UInt64(base + p.bytesConsumedAddr) },
+          "toHostStreamDoneInitAddrs"  -> nonEmptyParams.map { p => UInt64(base + p.toHostStreamDoneInitAddr) },
+          "toHostStreamFlushAddrs"     -> nonEmptyParams.map { p => UInt64(base + p.toHostStreamFlushAddr) },
+          "toHostStreamFlushDoneAddrs" -> nonEmptyParams.map { p => UInt64(base + p.toHostStreamFlushDoneAddr) },
+        )
+
+        for ((name, values) <- arraysToEmit) {
+          sb.append(genArray(s"${headerWidgetName}_${prefix}_${name}", values))
+        }
+      }
+
+      serializeStreamParameters("to_cpu", sourceDriverParameters)
+    }
  }
 }
--- a/sim/midas/src/main/scala/midas/core/FPGATop.scala
+++ b/sim/midas/src/main/scala/midas/core/FPGATop.scala
@ -317,8 +317,11 @@ class FPGATop(implicit p: Parameters) extends LazyModule with HasWidgets {
        beatBytes = params.dataBits / 8)
    ))

-    streamingEngine.fpgaManagedAXI4NodeOpt.foreach {
-      node := AXI4IdIndexer(params.idBits) := AXI4Buffer() := _
+    streamingEngine.fpgaManagedAXI4NodeOpt match {
+      case Some(engineNode) =>
+        node := AXI4IdIndexer(params.idBits) := AXI4Buffer() := engineNode
+      case None =>
+        node := AXI4TieOff()
    }
    (node, params)
  }
--- a/sim/midas/src/main/scala/midas/widgets/UsesBridgeStreams.scala
+++ b/sim/midas/src/main/scala/midas/widgets/UsesBridgeStreams.scala
@ -17,7 +17,7 @@ import midas.core.{
 /**
  * Bridge Streams serve as means to do bulk transport from BridgeDriver to
  * BridgeModule and vice versa.  Abstractly, they can be thought of as a 512b
-  * wide latency-insensitive channel (i.e., queue).
+  * wide latency-insensitive channel (i.e., a queue with some unknown latency).
  *
  * The two mixins in this file implement the two directions of
  * producer-consumer relationships: [[StreamFromHostCPU]] add a stream in
--- a/sim/src/main/scala/midasexamples/Config.scala
+++ b/sim/src/main/scala/midasexamples/Config.scala
@ -14,6 +14,7 @@ class NoConfig extends Config(Parameters.empty)
 class BaseMidasExamplesConfig extends Config(
  new WithDefaultMemModel ++
  new WithWiringTransform ++
+  new HostDebugFeatures ++
  new Config((site, here, up) => {
    case SynthAsserts => true
    case GenerateMultiCycleRamModels => true
--- a/sim/src/test/scala/midasexamples/TutorialSuite.scala
+++ b/sim/src/test/scala/midasexamples/TutorialSuite.scala
@ -6,22 +6,31 @@ import scala.util.matching.Regex
 import scala.io.Source
 import org.scalatest.Suites
 import org.scalatest.matchers.should._
+import freechips.rocketchip.config.Config
+
+object BaseConfigs {
+  def f1 = Seq(classOf[DefaultF1Config])
+  def vitis = Seq(classOf[DefaultVitisConfig])
+}

 abstract class TutorialSuite(
    val targetName: String, // See GeneratorUtils
    targetConfigs: String = "NoConfig",
-    platformConfigs: String = "HostDebugFeatures_DefaultF1Config",
+    platformConfigs: Seq[Class[_ <: Config]] = Seq(),
    tracelen: Int = 8,
    simulationArgs: Seq[String] = Seq()
  ) extends firesim.TestSuiteCommon with Matchers {

+  lazy val basePlatformConfig = BaseConfigs.f1.asInstanceOf[Seq[Class[_ <: Config]]]
  val backendSimulator = "verilator"
+  def platformConfigString = (platformConfigs ++ basePlatformConfig).map(_.getSimpleName).mkString("_")

-  val targetTuple = s"$targetName-$targetConfigs-$platformConfigs"
+
+  val targetTuple = s"$targetName-$targetConfigs-${platformConfigString}"
  val commonMakeArgs = Seq(s"TARGET_PROJECT=midasexamples",
                           s"DESIGN=$targetName",
                           s"TARGET_CONFIG=${targetConfigs}",
-                           s"PLATFORM_CONFIG=${platformConfigs}")
+                           s"PLATFORM_CONFIG=${platformConfigString}")

  def run(backend: String,
          debug: Boolean = false,
@ -181,14 +190,17 @@ abstract class TutorialSuite(

 //class PointerChaserF1Test extends TutorialSuite(
 //  "PointerChaser", "PointerChaserConfig", simulationArgs = Seq("`cat runtime.conf`"))
+
 class GCDF1Test extends TutorialSuite("GCD")
+class GCDVitisTest extends GCDF1Test { override lazy val basePlatformConfig = BaseConfigs.vitis }
+
 // Hijack Parity to test all of the Midas-level backends
 class ParityF1Test extends TutorialSuite("Parity") {
  runTest("verilator", true)
  runTest("vcs", true)
 }
-
-class ParityVitisTest extends TutorialSuite("Parity", platformConfigs = classOf[DefaultVitisConfig].getSimpleName) {
+class ParityVitisTest extends TutorialSuite("Parity") {
+  override lazy val basePlatformConfig = BaseConfigs.vitis
  runTest("verilator", true)
  runTest("vcs", true)
 }
@ -254,7 +266,7 @@ class AutoCounterCoverModuleF1Test extends TutorialSuite("AutoCounterCoverModule
 }
 class AutoCounterPrintfF1Test extends TutorialSuite("AutoCounterPrintfModule",
    simulationArgs = Seq("+print-file=synthprinttest.out"),
-    platformConfigs = "AutoCounterPrintf_HostDebugFeatures_DefaultF1Config") {
+    platformConfigs = classOf[AutoCounterPrintf] +: BaseConfigs.f1) {
  diffSynthesizedLog("synthprinttest.out0", stdoutPrefix = "AUTOCOUNTER_PRINT CYCLE", synthPrefix = "CYCLE")
 }
 class AutoCounterGlobalResetConditionF1Test extends TutorialSuite("AutoCounterGlobalResetCondition",
@ -282,8 +294,12 @@ class AutoCounterGlobalResetConditionF1Test extends TutorialSuite("AutoCounterGl

 class PrintfModuleF1Test extends TutorialSuite("PrintfModule",
  simulationArgs = Seq("+print-no-cycle-prefix", "+print-file=synthprinttest.out")) {
+  runTest("vcs", true)
  diffSynthesizedLog("synthprinttest.out0")
 }
+
+class PrintfModuleVitisTest extends PrintfModuleF1Test { override lazy val basePlatformConfig = BaseConfigs.vitis }
+
 class NarrowPrintfModuleF1Test extends TutorialSuite("NarrowPrintfModule",
  simulationArgs = Seq("+print-no-cycle-prefix", "+print-file=synthprinttest.out")) {
  diffSynthesizedLog("synthprinttest.out0")
@ -353,6 +369,8 @@ class MulticlockPrintF1Test extends TutorialSuite("MulticlockPrintfModule",
    synthLinesToDrop = 4)
 }

+class MulticlockPrintVitisTest extends MulticlockPrintF1Test { override lazy val basePlatformConfig = BaseConfigs.vitis }
+
 class MulticlockAutoCounterF1Test extends TutorialSuite("MulticlockAutoCounterModule",
    simulationArgs = Seq("+autocounter-readrate=1000", "+autocounter-filename-base=autocounter")) {
  checkAutoCounterCSV("autocounter0.csv", "AUTOCOUNTER_PRINT ")
@ -395,7 +413,7 @@ class PassthroughModelBridgeSourceTest extends TutorialSuite("PassthroughModelBr
 class ResetPulseBridgeActiveHighTest extends TutorialSuite(
    "ResetPulseBridgeTest",
    // Disable assertion synthesis to rely on native chisel assertions to catch bad behavior
-    platformConfigs = "NoSynthAsserts_HostDebugFeatures_DefaultF1Config",
+    platformConfigs = classOf[NoSynthAsserts] +: BaseConfigs.f1,
    simulationArgs = Seq(s"+reset-pulse-length0=${ResetPulseBridgeTestConsts.maxPulseLength}")) {
  runTest(backendSimulator,
    args = Seq(s"+reset-pulse-length0=${ResetPulseBridgeTestConsts.maxPulseLength + 1}"),
@ -405,7 +423,7 @@ class ResetPulseBridgeActiveHighTest extends TutorialSuite(
 class ResetPulseBridgeActiveLowTest extends TutorialSuite(
    "ResetPulseBridgeTest",
    targetConfigs = "ResetPulseBridgeActiveLowConfig",
-    platformConfigs = "NoSynthAsserts_HostDebugFeatures_DefaultF1Config",
+    platformConfigs = classOf[NoSynthAsserts] +: BaseConfigs.f1,
    simulationArgs = Seq(s"+reset-pulse-length0=${ResetPulseBridgeTestConsts.maxPulseLength}")) {
  runTest(backendSimulator,
    args = Seq(s"+reset-pulse-length0=${ResetPulseBridgeTestConsts.maxPulseLength + 1}"),
@ -434,8 +452,7 @@ class CustomConstraintsF1Test extends TutorialSuite("CustomConstraints") {
    atLeast (1, xdc) should fullyMatch regex "constrain_impl2 \\[reg WRAPPER_INST/CL/firesim_top/.*/dut/r1]".r
  }
 }
-
-// Suite Collections
+// Midasexample Suite Collections
 class ChiselExampleDesigns extends Suites(
  new GCDF1Test,
  new ParityF1Test,
@ -499,6 +516,13 @@ class FMRCITests extends Suites(
  new PassthroughModelBridgeSourceTest,
 )

+class VitisCITests extends Suites (
+  new GCDVitisTest,
+  new ParityVitisTest,
+  new PrintfModuleVitisTest,
+  new MulticlockPrintVitisTest,
+)
+
 // These groups are vestigial from CircleCI container limits
 class CIGroupA extends Suites(
  new ChiselExampleDesigns,
@ -515,5 +539,6 @@ class CIGroupB extends Suites(
  new firesim.fasedtests.CIGroupB,
  new firesim.AllMidasUnitTests,
  new firesim.FailingUnitTests,
-  new FMRCITests
+  new FMRCITests,
+  new VitisCITests
 )