Merge branch 'main' into update-ci

This commit is contained in:
Abraham Gonzalez 2022-10-19 11:30:58 -07:00 committed by GitHub
commit 62f5daad31
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
19 changed files with 333 additions and 114 deletions

View File

@ -20,11 +20,6 @@ def wait_machine_launch_complete():
run("cat /machine-launchstatus.log")
raise Exception("machine-launch-script.sh failed to run")
# increase file descriptor limit system wide so that newer versions of
# buildroot don't fail. See discussion in https://github.com/firesim/firesim/pull/1132.
sudo("echo '* hard nofile 16384' >> /etc/security/limits.conf")
sudo("echo '* soft nofile 16384' >> /etc/security/limits.conf")
def setup_self_hosted_runners():
""" Installs GHA self-hosted runner machinery on the manager. """

View File

@ -264,22 +264,23 @@ jobs:
- name: Run linux-poweroff test w/ externally provisioned (AWS EC2) run farm
run: .github/scripts/run-linux-poweroff-externally-provisioned.py
run-basic-linux-poweroff-vitis:
if: contains(github.event.pull_request.labels.*.name, 'ci:fpga-deploy')
name: run-basic-linux-poweroff-vitis
runs-on: local-fpga
env:
TERM: xterm-256-color
steps:
# This forces a fresh clone of the repo during the `checkout` step
# to resolve stale submodule URLs. See https://github.com/ucb-bar/chipyard/pull/1156.
- name: Delete old checkout
run: |
rm -rf ${{ github.workspace }}/* || true
rm -rf ${{ github.workspace }}/.* || true
- uses: actions/checkout@v3
- name: Run simple linux poweroff test w/ vitis
run: .github/scripts/run-linux-poweroff-vitis.py
# AJG: disable temporarily due to local CI machine issues
# run-basic-linux-poweroff-vitis:
# if: contains(github.event.pull_request.labels.*.name, 'ci:fpga-deploy')
# name: run-basic-linux-poweroff-vitis
# runs-on: local-fpga
# env:
# TERM: xterm-256-color
# steps:
# # This forces a fresh clone of the repo during the `checkout` step
# # to resolve stale submodule URLs. See https://github.com/ucb-bar/chipyard/pull/1156.
# - name: Delete old checkout
# run: |
# rm -rf ${{ github.workspace }}/* || true
# rm -rf ${{ github.workspace }}/.* || true
# - uses: actions/checkout@v3
# - name: Run simple linux poweroff test w/ vitis
# run: .github/scripts/run-linux-poweroff-vitis.py
documentation-check:
name: documentation-check

View File

@ -2,6 +2,25 @@
This changelog follows the format defined here: https://keepachangelog.com/en/1.0.0/
## [1.15.1] - 2022-10-18
Fixes to metasimulation, TracerV, and improved cross-platform support.
### Added
* sourceme-f1-manager.sh now has a --skip-ssh-setup argument for users who have pre-set ssh-agent config #1266
### Changed
* Instance liveness check now checks to see if login shell is reasonable #1266
* Driver/Metasim build at runtime now executed via run() to avoid conda warnings #1266
* Setup for QCOW2 on a run farm is only performed if the simulation needs it #1266
* The sim launch command is now written to a file before being executed for easier debugging. #1266
### Fixed
* Fix missing code in RuntimeBuildRecipeConfig that broke metasims #1266
* Hide warnings from sudo check, guestmount, etc. #1266
* Open file limit increased by default in machine-launch-script to work around buildroot bug. #1266
* TracerV: fix loop bounds in token processing #1249
## [1.15.0] - 2022-09-30
Full migration to Conda-based environment/dependency management; Chipyard now also uses Conda. Bump Rocket Chip/Chisel/etc. Various bugfixes/feature improvements.

View File

@ -2,8 +2,6 @@
# FireSim initial setup script.
# TODO: build FireSim linux distro here?
# exit script if any command fails
set -e
set -o pipefail
@ -184,6 +182,7 @@ if [ "$IS_LIBRARY" = false ]; then
env_append "export PATH=$RDIR/sw/firesim-software:\$PATH"
env_append "source $RDIR/scripts/fix-open-files.sh"
else
# FireMarshal setup
target_chipyard_dir="$RDIR/../.."

View File

@ -115,7 +115,7 @@ class F1BitBuilder(BitBuilder):
prefix(f'export RISCV={os.getenv("RISCV", "")}'), \
prefix(f'export PATH={os.getenv("PATH", "")}'), \
prefix(f'export LD_LIBRARY_PATH={os.getenv("LD_LIBRARY_PATH", "")}'), \
prefix('source sourceme-f1-manager.sh'), \
prefix('source sourceme-f1-manager.sh --skip-ssh-setup'), \
prefix('cd sim/'):
run(self.build_config.make_recipe("PLATFORM=f1 replace-rtl"))
@ -126,7 +126,7 @@ class F1BitBuilder(BitBuilder):
prefix(f'export RISCV={os.getenv("RISCV", "")}'), \
prefix(f'export PATH={os.getenv("PATH", "")}'), \
prefix(f'export LD_LIBRARY_PATH={os.getenv("LD_LIBRARY_PATH", "")}'), \
prefix('source sourceme-f1-manager.sh'), \
prefix('source sourceme-f1-manager.sh --skip-ssh-setup'), \
prefix('cd sim/'):
run(self.build_config.make_recipe("PLATFORM=f1 driver"))
@ -373,7 +373,7 @@ class VitisBitBuilder(BitBuilder):
prefix(f'export RISCV={os.getenv("RISCV", "")}'), \
prefix(f'export PATH={os.getenv("PATH", "")}'), \
prefix(f'export LD_LIBRARY_PATH={os.getenv("LD_LIBRARY_PATH", "")}'), \
prefix('source sourceme-f1-manager.sh'), \
prefix('source sourceme-f1-manager.sh --skip-ssh-setup'), \
prefix('cd sim/'):
run(self.build_config.make_recipe("PLATFORM=vitis replace-rtl"))
@ -384,7 +384,7 @@ class VitisBitBuilder(BitBuilder):
prefix(f'export RISCV={os.getenv("RISCV", "")}'), \
prefix(f'export PATH={os.getenv("PATH", "")}'), \
prefix(f'export LD_LIBRARY_PATH={os.getenv("LD_LIBRARY_PATH", "")}'), \
prefix('source sourceme-f1-manager.sh'), \
prefix('source sourceme-f1-manager.sh --skip-ssh-setup'), \
prefix('cd sim/'):
run(self.build_config.make_recipe("PLATFORM=vitis driver"))

View File

@ -5,7 +5,7 @@ from __future__ import annotations
import logging
import abc
from fabric.contrib.project import rsync_project # type: ignore
from fabric.api import run, local, warn_only, get # type: ignore
from fabric.api import run, local, warn_only, get, put, cd, hide # type: ignore
from fabric.exceptions import CommandTimeout # type: ignore
from runtools.switch_model_config import AbstractSwitchToSwitchConfig
@ -253,7 +253,7 @@ class FireSimServerNode(FireSimNode):
def allocate_nbds(self) -> None:
""" called by the allocate nbds pass to assign an nbd to a qcow2 image. """
rootfses_list = [self.get_rootfs_name()]
rootfses_list = self.get_all_rootfs_names()
for rootfsname in rootfses_list:
if rootfsname is not None and rootfsname.endswith(".qcow2"):
host_inst = self.get_host_instance()
@ -312,13 +312,66 @@ class FireSimServerNode(FireSimNode):
return runcommand
def get_local_job_results_dir_path(self) -> str:
    """Return the local job results directory path for this server's job.

    Example: results-workload/workloadname/jobname/

    Returns:
        Path string, always terminated with a trailing slash.
    """
    jobinfo = self.get_job()
    # Reuse the already-fetched job info instead of calling get_job() twice.
    job_results_dir = jobinfo.parent_workload.job_results_dir
    job_dir = f"{job_results_dir}/{jobinfo.jobname}/"
    return job_dir
def get_local_job_monitoring_file_path(self) -> str:
    """Return the local job monitoring file path for this server's job.

    Example: results-workload/workloadname/.monitoring-dir/jobname

    Returns:
        Path string for the per-job monitoring file (a file, not a dir).
    """
    jobinfo = self.get_job()
    # Reuse the already-fetched job info instead of calling get_job() twice.
    job_monitoring_dir = jobinfo.parent_workload.job_monitoring_dir
    job_monitoring_file = f"{job_monitoring_dir}/{jobinfo.jobname}"
    return job_monitoring_file
def write_job_complete_file(self) -> None:
    """Signal the monitoring flow that this job has completed.

    Creates (or overwrites) the job's monitoring file with a single
    "Done" line; the monitoring pass treats the file's presence as
    completion of the job.
    """
    monitoring_file_path = self.get_local_job_monitoring_file_path()
    with open(monitoring_file_path, 'w') as out_file:
        out_file.write("Done\n")
def mkdir_and_prep_local_job_results_dir(self) -> None:
    """Create the local job results directory and write pre-sim metadata.

    The only metadata written today is HW_CFG_SUMMARY, recording the
    hardware configuration this job was run with.
    """
    job_dir = self.get_local_job_results_dir_path()
    for shell_cmd in (
        f"mkdir -p {job_dir}",
        # Record the hardware config summary alongside the job results.
        f'echo "{self.server_hardware_config}" > {job_dir}/HW_CFG_SUMMARY',
    ):
        localcap = local(shell_cmd, capture=True)
        rootLogger.debug("[localhost] " + str(localcap))
        rootLogger.debug("[localhost] " + str(localcap.stderr))
def write_script(self, script_name, command) -> str:
""" Write a script named script_name to the local job results dir with
shebang + command + newline. Return the full local path."""
job_dir = self.get_local_job_results_dir_path()
script_path = job_dir + script_name
with open(script_path, 'w') as lfile:
lfile.write("#!/usr/bin/env bash\n")
lfile.write(command)
lfile.write("\n")
return script_path
def write_sim_start_script(self, slotno: int, sudo: bool) -> str:
    """Write sim-run.sh (the simulation launch script) to the local job
    results dir.

    Args:
        slotno: Simulation slot number on the host instance.
        sudo: Whether the simulation must be launched with sudo.

    Returns:
        The local path of the written sim-run.sh script.
    """
    launch_command = self.get_sim_start_command(slotno, sudo)
    return self.write_script("sim-run.sh", launch_command)
def copy_back_job_results_from_run(self, slotno: int, sudo: bool) -> None:
"""
1) Make the local directory for this job's output
2) Copy back UART log
3) Mount rootfs on the remote node and copy back files
TODO: move this somewhere else, it's kinda in a weird place...
1) Copy back UART log
2) Mount rootfs on the remote node and copy back files
"""
assert self.has_assigned_host_instance(), "copy requires assigned host instance"
@ -331,20 +384,12 @@ class FireSimServerNode(FireSimNode):
])
jobinfo = self.get_job()
simserverindex = slotno
job_results_dir = self.get_job().parent_workload.job_results_dir
job_dir = """{}/{}/""".format(job_results_dir, jobinfo.jobname)
job_dir = self.get_local_job_results_dir_path()
localcap = local("""mkdir -p {}""".format(job_dir), capture=True)
rootLogger.debug("[localhost] " + str(localcap))
rootLogger.debug("[localhost] " + str(localcap.stderr))
# add hw config summary per job
localcap = local("""echo "{}" > {}/HW_CFG_SUMMARY""".format(str(self.server_hardware_config), job_dir), capture=True)
rootLogger.debug("[localhost] " + str(localcap))
rootLogger.debug("[localhost] " + str(localcap.stderr))
self.write_job_complete_file()
dest_sim_dir = self.get_host_instance().get_sim_dir()
dest_sim_slot_dir = f"{dest_sim_dir}/sim_slot_{slotno}/"
def mount(img: str, mnt: str, tmp_dir: str) -> None:
if sudo:
@ -371,7 +416,7 @@ class FireSimServerNode(FireSimNode):
rfsname = self.get_rootfs_name()
if rfsname is not None:
is_qcow2 = rfsname.endswith(".qcow2")
mountpoint = """{}/sim_slot_{}/mountpoint""".format(dest_sim_dir, simserverindex)
mountpoint = dest_sim_slot_dir + "mountpoint"
run("""{} mkdir -p {}""".format("sudo" if sudo else "", mountpoint))
@ -382,10 +427,10 @@ class FireSimServerNode(FireSimNode):
assert nbd_tracker is not None
rfsname = nbd_tracker.get_nbd_for_imagename(rfsname)
else:
rfsname = """{}/sim_slot_{}/{}""".format(dest_sim_dir, simserverindex, rfsname)
rfsname = dest_sim_slot_dir + rfsname
mount(rfsname, mountpoint, f"{dest_sim_dir}/sim_slot_{simserverindex}")
with warn_only():
mount(rfsname, mountpoint, dest_sim_slot_dir)
with warn_only(), hide('warnings'):
# ignore if this errors. not all rootfses have /etc/sysconfig/nfs
run("""{} chattr -i {}/etc/sysconfig/nfs""".format("sudo" if sudo else "", mountpoint))
@ -402,7 +447,7 @@ class FireSimServerNode(FireSimNode):
rootLogger.debug(rsync_cap.stderr)
## unmount
umount(mountpoint, f"{dest_sim_dir}/sim_slot_{simserverindex}")
umount(mountpoint, dest_sim_slot_dir)
## if qcow2, detach .qcow2 image from the device, we're done with it
if is_qcow2:
@ -411,7 +456,7 @@ class FireSimServerNode(FireSimNode):
## copy output files generated by the simulator that live on the host:
## e.g. uartlog, memory_stats.csv, etc
remote_sim_run_dir = """{}/sim_slot_{}/""".format(dest_sim_dir, simserverindex)
remote_sim_run_dir = dest_sim_slot_dir
for simoutputfile in jobinfo.simoutputs:
with warn_only():
rsync_cap = rsync_project(remote_dir=remote_sim_run_dir + simoutputfile,
@ -481,6 +526,15 @@ class FireSimServerNode(FireSimNode):
# cases
return self.get_job_name() + "-" + rootfs_path.split("/")[-1]
def get_all_rootfs_names(self) -> List[Optional[str]]:
    """Return the rootfs filenames for every sim this node manages.

    A plain server node has exactly one rootfs; subclasses (e.g.
    supernodes) override this to also include sibling rootfses.
    """
    single_rootfs = self.get_rootfs_name()
    return [single_rootfs]
def qcow2_support_required(self) -> bool:
    """Return True iff any rootfs for this sim requires QCOW2 support.

    QCOW2 support is determined purely by filename: a rootfs whose name
    ends in ``.qcow2`` needs the NBD/qemu-img infrastructure on the host.
    """
    # Generator expression instead of map(lambda ...): same semantics,
    # short-circuits on the first match, and reads idiomatically.
    return any(
        name is not None and name.endswith(".qcow2")
        for name in self.get_all_rootfs_names()
    )
def get_bootbin_name(self) -> str:
# prefix bootbin name with the job name to disambiguate in supernode
# cases
@ -513,21 +567,6 @@ class FireSimSuperNodeServerNode(FireSimServerNode):
sib.assign_host_instance(super_server_host)
sib.copy_back_job_results_from_run(slotno, sudo)
def allocate_nbds(self) -> None:
""" called by the allocate nbds pass to assign an nbd to a qcow2 image.
"""
num_siblings = self.supernode_get_num_siblings_plus_one()
rootfses_list = [self.get_rootfs_name()] + [self.supernode_get_sibling(x).get_rootfs_name() for x in range(1, num_siblings)]
for rootfsname in rootfses_list:
if rootfsname is not None and rootfsname.endswith(".qcow2"):
host_inst = self.get_host_instance()
assert isinstance(host_inst.instance_deploy_manager, EC2InstanceDeployManager)
nbd_tracker = host_inst.instance_deploy_manager.nbd_tracker
assert nbd_tracker is not None
allocd_device = nbd_tracker.get_nbd_for_imagename(rootfsname)
def supernode_get_num_siblings_plus_one(self) -> int:
""" This returns the number of siblings the supernodeservernode has,
plus one (because in most places, we use siblings + 1, not just siblings)
@ -554,6 +593,11 @@ class FireSimSuperNodeServerNode(FireSimServerNode):
return node
assert False, "Should return supernode sibling"
def get_all_rootfs_names(self) -> List[Optional[str]]:
    """Return rootfs filenames for this supernode and all of its siblings."""
    sibling_count_plus_one = self.supernode_get_num_siblings_plus_one()
    names: List[Optional[str]] = [self.get_rootfs_name()]
    for sibling_idx in range(1, sibling_count_plus_one):
        names.append(self.supernode_get_sibling(sibling_idx).get_rootfs_name())
    return names
def get_sim_start_command(self, slotno: int, sudo: bool) -> str:
""" get the command to run a simulation. assumes it will be
called in a directory where its required_files are already located."""
@ -570,7 +614,7 @@ class FireSimSuperNodeServerNode(FireSimServerNode):
assert self.plusarg_passthrough is not None
all_macs = [self.get_mac_address()] + [self.supernode_get_sibling(x).get_mac_address() for x in range(1, num_siblings)]
all_rootfses = self.process_qcow2_rootfses([self.get_rootfs_name()] + [self.supernode_get_sibling(x).get_rootfs_name() for x in range(1, num_siblings)])
all_rootfses = self.process_qcow2_rootfses(self.get_all_rootfs_names())
all_bootbins = [self.get_bootbin_name()] + [self.supernode_get_sibling(x).get_bootbin_name() for x in range(1, num_siblings)]
all_linklatencies = [self.server_link_latency]
for x in range(1, num_siblings):

View File

@ -7,6 +7,7 @@ import os
import pprint
import logging
import datetime
import sys
from fabric.api import env, parallel, execute, run, local, warn_only # type: ignore
from colorama import Fore, Style # type: ignore
from functools import reduce
@ -26,10 +27,40 @@ rootLogger = logging.getLogger()
@parallel
def instance_liveness() -> None:
""" Confirm that all instances are accessible (are running and can be ssh'ed into) first so that we don't run any
actual firesim-related commands on only some of the run farm machines."""
""" Confirm that all instances are accessible (are running and can be
ssh'ed into) first so that we don't run any actual firesim-related commands
on only some of the run farm machines.
Also confirm that the default shell in use is one that is known to handle
commands we pass to run() in the manager. The default shell must be able to
handle our command strings because it is always the first to interpret the
command string, even if the command string starts with /bin/bash.
To my knowledge, it is not possible to specify a different shell for
a specific instance of ssh-ing into a machine. The only way to control what
shell the command is handed to is to set the default shell. As reported in:
https://serverfault.com/questions/162018/force-ssh-to-use-a-specific-shell
For shell handling, this function will do the following:
a) For known good shells (specified in "allowed_shells"), continue normally.
b) For known bad shells (specified in "disallowed_shells"), report error and
exit immediately.
c) For unknown shells, print a warning and continue normally.
"""
rootLogger.info("""[{}] Checking if host instance is up...""".format(env.host_string))
run("uname -a")
collect = run("echo $SHELL")
allowed_shells = ["bash"]
disallowed_shells = ["csh"]
shell_info = collect.stdout.split("/")[-1]
if shell_info in allowed_shells:
return
if shell_info in disallowed_shells:
rootLogger.error(f"::ERROR:: Invalid default shell in use: {shell_info}. Allowed shells: {allowed_shells}.")
sys.exit(1)
rootLogger.warning(f"::WARNING:: Unknown default shell in use: {shell_info}. Allowed shells: {allowed_shells}. You are using a default shell that has not yet been tested to correctly interpret the commands run by the FireSim manager. Proceed at your own risk. If you find that your shell works correctly, please file an issue on the FireSim repo (https://github.com/firesim/firesim/issues) so that we can add your shell to the list of known good shells.")
class FireSimTopologyWithPasses:
""" This class constructs a FireSimTopology, then performs a series of passes
@ -363,11 +394,14 @@ class FireSimTopologyWithPasses:
def pass_build_required_drivers(self) -> None:
""" Build all simulation drivers. The method we're calling here won't actually
repeat the build process more than once per run of the manager. """
servers = self.firesimtopol.get_dfs_order_servers()
def build_drivers_helper(servers: List[FireSimServerNode]) -> None:
for server in servers:
server.get_resolved_server_hardware_config().build_sim_driver()
servers = self.firesimtopol.get_dfs_order_servers()
execute(build_drivers_helper, servers, hosts=['localhost'])
def pass_build_required_switches(self) -> None:
""" Build all the switches required for this simulation. """
# the way the switch models are designed, this requires hosts to be
@ -478,6 +512,11 @@ class FireSimTopologyWithPasses:
rootLogger.debug("[localhost] " + str(localcap))
rootLogger.debug("[localhost] " + str(localcap.stderr))
rootLogger.debug("""Creating the directory: {}""".format(self.workload.job_monitoring_dir))
localcap = local("""mkdir -p {}""".format(self.workload.job_monitoring_dir), capture=True)
rootLogger.debug("[localhost] " + str(localcap))
rootLogger.debug("[localhost] " + str(localcap.stderr))
# boot up as usual
self.boot_simulation_passes(False, skip_instance_binding=True)
@ -573,8 +612,8 @@ class FireSimTopologyWithPasses:
def get_jobs_completed_local_info():
# this is a list of jobs completed, since any completed job will have
# a directory within this directory.
jobscompleted = os.listdir(self.workload.job_results_dir)
rootLogger.debug("dir based jobs completed: " + str(jobscompleted))
jobscompleted = os.listdir(self.workload.job_monitoring_dir)
rootLogger.debug("Monitoring dir jobs completed: " + str(jobscompleted))
return jobscompleted
jobscompleted = get_jobs_completed_local_info()

View File

@ -107,6 +107,10 @@ class Inst(metaclass=abc.ABCMeta):
self.sim_slots.append(firesimservernode)
firesimservernode.assign_host_instance(self)
def qcow2_support_required(self) -> bool:
    """Return True iff any simulation on this Inst requires qcow2.

    Delegates to each assigned sim slot; an Inst with no sim slots
    trivially requires no qcow2 support.
    """
    # Generator expression (no intermediate list) lets any() short-circuit.
    return any(slot.qcow2_support_required() for slot in self.sim_slots)
class RunFarm(metaclass=abc.ABCMeta):
"""Abstract class to represent how to manage run farm hosts (similar to `BuildFarm`).
In addition to having to implement how to spawn/terminate nodes, the child classes must

View File

@ -101,11 +101,11 @@ class InstanceDeployManager(metaclass=abc.ABCMeta):
rootLogger.info("""[{}] """.format(env.host_string) + logstr)
def sim_node_qcow(self) -> None:
""" If NBD is available, install qemu-img management tools and copy NBD
infra to remote node. This assumes that the kernel module was already
built and exists in the directory on this machine.
"""
if self.nbd_tracker is not None:
""" If NBD is available and qcow2 support is required, install qemu-img
management tools and copy NBD infra to remote node. This assumes that
the kernel module was already built and exists in the directory on this
machine. """
if self.nbd_tracker is not None and self.parent_node.qcow2_support_required():
self.instance_logger("""Setting up remote node for qcow2 disk images.""")
# get qemu-nbd
### XXX Centos Specific
@ -114,16 +114,18 @@ class InstanceDeployManager(metaclass=abc.ABCMeta):
put('../build/nbd.ko', '/home/centos/nbd.ko', mirror_local_mode=True)
def load_nbd_module(self) -> None:
""" If NBD is available, load the nbd module. always unload the module
first to ensure it is in a clean state. """
if self.nbd_tracker is not None:
""" If NBD is available and qcow2 support is required, load the nbd
module. always unload the module first to ensure it is in a clean
state. """
if self.nbd_tracker is not None and self.parent_node.qcow2_support_required():
self.instance_logger("Loading NBD Kernel Module.")
self.unload_nbd_module()
run("""sudo insmod /home/centos/nbd.ko nbds_max={}""".format(self.nbd_tracker.NBDS_MAX))
def unload_nbd_module(self) -> None:
""" If NBD is available, unload the nbd module. """
if self.nbd_tracker is not None:
""" If NBD is available and qcow2 support is required, unload the nbd
module. """
if self.nbd_tracker is not None and self.parent_node.qcow2_support_required():
self.instance_logger("Unloading NBD Kernel Module.")
# disconnect all /dev/nbdX devices before rmmod
@ -132,8 +134,9 @@ class InstanceDeployManager(metaclass=abc.ABCMeta):
run('sudo rmmod nbd')
def disconnect_all_nbds_instance(self) -> None:
""" If NBD is available, disconnect all nbds on the instance. """
if self.nbd_tracker is not None:
""" If NBD is available and qcow2 support is required, disconnect all
nbds on the instance. """
if self.nbd_tracker is not None and self.parent_node.qcow2_support_required():
self.instance_logger("Disconnecting all NBDs.")
# warn_only, so we can call this even if there are no nbds
@ -203,8 +206,15 @@ class InstanceDeployManager(metaclass=abc.ABCMeta):
remote_sim_dir = """{}/sim_slot_{}/""".format(remote_home_dir, slotno)
assert slotno < len(self.parent_node.sim_slots)
server = self.parent_node.sim_slots[slotno]
# make the local job results dir for this sim slot
server.mkdir_and_prep_local_job_results_dir()
sim_start_script_local_path = server.write_sim_start_script(slotno, has_sudo())
put(sim_start_script_local_path, remote_sim_dir)
with cd(remote_sim_dir):
run(server.get_sim_start_command(slotno, has_sudo()))
run("chmod +x sim-run.sh")
run("./sim-run.sh")
def kill_switch_slot(self, switchslot: int) -> None:

View File

@ -10,7 +10,7 @@ import logging
import yaml
import os
import sys
from fabric.api import prefix, settings, local # type: ignore
from fabric.api import prefix, settings, local, run # type: ignore
from awstools.awstools import aws_resource_names
from awstools.afitools import get_firesim_tagval_for_agfi
@ -20,6 +20,8 @@ from runtools.run_farm import RunFarm
from runtools.simulation_data_classes import TracerVConfig, AutoCounterConfig, HostDebugConfig, SynthPrintConfig
from util.inheritors import inheritors
from util.deepmerge import deep_merge
from util.streamlogger import InfoStreamLogger
from buildtools.bitbuilder import get_deploy_dir
from typing import Optional, Dict, Any, List, Sequence, Tuple, TYPE_CHECKING
import argparse # this is not within a if TYPE_CHECKING: scope so the `register_task` in FireSim can evaluate it's annotation
@ -246,25 +248,17 @@ class RuntimeHWConfig:
target_config = triplet_pieces[1]
platform_config = triplet_pieces[2]
rootLogger.info(f"Building {self.driver_type_message} driver for {str(self.get_deploytriplet_for_config())}")
with prefix('cd ../'), \
prefix('export RISCV={}'.format(os.getenv('RISCV', ""))), \
prefix('export PATH={}'.format(os.getenv('PATH', ""))), \
prefix('export LD_LIBRARY_PATH={}'.format(os.getenv('LD_LIBRARY_PATH', ""))), \
prefix('source ./sourceme-f1-manager.sh'), \
prefix('cd sim/'), \
prefix('set -o pipefail'):
localcap = None
with settings(warn_only=True):
# the local driver dir must already exist for the tee to always
# work
local("""mkdir -p {}""".format(self.get_local_driver_dir()))
buildlogfile = """{}firesim-manager-make-{}-temp-output-log""".format(self.get_local_driver_dir(), self.driver_build_target)
driverbuildcommand = """make DESIGN={} TARGET_CONFIG={} PLATFORM_CONFIG={} PLATFORM={} {}""" .format(design, target_config, platform_config, self.platform, self.driver_build_target)
driverbuildcommand_full = driverbuildcommand + """ 2>&1 | tee {}""".format(buildlogfile)
localcap = local(driverbuildcommand_full)
logcapture = local("""cat {}""".format(buildlogfile), capture=True)
rootLogger.debug("[localhost] " + str(logcapture))
if localcap.failed:
with InfoStreamLogger('stdout'), prefix(f'cd {get_deploy_dir()}/../'), \
prefix(f'export RISCV={os.getenv("RISCV", "")}'), \
prefix(f'export PATH={os.getenv("PATH", "")}'), \
prefix(f'export LD_LIBRARY_PATH={os.getenv("LD_LIBRARY_PATH", "")}'), \
prefix('source sourceme-f1-manager.sh --skip-ssh-setup'), \
prefix('cd sim/'):
driverbuildcommand = f"make DESIGN={design} TARGET_CONFIG={target_config} PLATFORM_CONFIG={platform_config} PLATFORM={self.platform} {self.driver_build_target}"
buildresult = run(driverbuildcommand)
if buildresult.failed:
rootLogger.info(f"{self.driver_type_message} driver build failed. Exiting. See log for details.")
rootLogger.info("""You can also re-run '{}' in the 'firesim/sim' directory to debug this error.""".format(driverbuildcommand))
sys.exit(1)
@ -311,6 +305,8 @@ class RuntimeBuildRecipeConfig(RuntimeHWConfig):
self.metasimulation_only_plusargs = metasimulation_only_plusargs
self.metasimulation_only_vcs_plusargs = metasimulation_only_vcs_plusargs
self.additional_required_files = []
def get_boot_simulation_command(self,
slotid: int,
all_macs: Sequence[MacAddress],

View File

@ -7,14 +7,14 @@ import logging
from os import fspath
from os.path import realpath
from pathlib import Path
from fabric.api import run, warn_only # type: ignore
from fabric.api import run, warn_only, hide # type: ignore
from typing import List, Tuple, Type
rootLogger = logging.getLogger()
def has_sudo() -> bool:
with warn_only():
with warn_only(), hide('warnings'):
return run("sudo -ln true").return_code == 0
def get_local_shared_libraries(elf: str) -> List[Tuple[str, str]]:

View File

@ -98,6 +98,7 @@ class WorkloadConfig:
jobs: List[JobConfig]
post_run_hook: str
job_results_dir: str
job_monitoring_dir: str
def __init__(self, workloadfilename: str, launch_time: str, suffixtag: str) -> None:
self.workloadfilename = self.workloadinputs + workloadfilename
@ -140,6 +141,8 @@ class WorkloadConfig:
launch_time,
self.workload_name,
appendsuffix)
# hidden dir to keep job monitoring information
self.job_monitoring_dir = self.job_results_dir + ".monitoring-dir/"
#import code
#code.interact(local=locals())

View File

@ -0,0 +1,27 @@
Manager Development
=======================================================
Writing PyTests
+++++++++++++++++
PyTests for the FireSim manager are located in :gh-file-ref:`deploy/tests`.
To write a PyTest, please refer to https://docs.pytest.org/en/7.1.x/.
Running PyTests Locally
+++++++++++++++++++++++
Assuming the FireSim repository is set up properly, PyTests can be run by doing the following:
::
cd <FireSim Root>
cd deploy/
pytest
By default this will run all PyTests.
Adding PyTests To CI
+++++++++++++++++++++++
By default all PyTests are run by CI using the same command shown in the prior section.
This can be seen in https://github.com/firesim/firesim/blob/d16969b984df6d0cb5cd3e8ed27d89d03095a180/.github/workflows/firesim-run-tests.yml#L147-L156 and :gh-file-ref:`.github/scripts/run-manager-pytests.py`.

View File

@ -53,6 +53,7 @@ New to FireSim? Jump to the :ref:`firesim-basics` page for more info.
Developer-Docs/Host-Platform-Debugging
Developer-Docs/VSCode-Integration
Developer-Docs/Managing-Conda-Lock-File
Developer-Docs/Manager-Development
.. toctree::
:maxdepth: 3

12
scripts/fix-open-files.sh Normal file
View File

@ -0,0 +1,12 @@
# Work around a buildroot bug that requires a large open-files limit.
REQUIRED_LIMIT=16384

# First, check whether the system-wide (hard) limit is sufficient; if it
# is not, warn the user that FireMarshal Linux builds will fail until the
# hard limit is raised.
HARD_LIMIT=$(ulimit -Hn)
if [ "$HARD_LIMIT" -lt "$REQUIRED_LIMIT" ]; then
    echo "WARNING: Your system does not support an open files limit (the output of 'ulimit -Sn' and 'ulimit -Hn') of at least $REQUIRED_LIMIT, which is required to workaround a bug in buildroot. You will not be able to build a Linux distro with FireMarshal until this is addressed."
fi

# In any case, raise the soft limit as far as allowed (up to the hard limit).
ulimit -Sn $(ulimit -Hn)

View File

@ -277,6 +277,9 @@ set -o pipefail
fi
"${DRY_RUN_ECHO[@]}" $SUDO "${CONDA_ENV_BIN}/activate-global-python-argcomplete" "${argcomplete_extra_args[@]}"
# emergency fix for buildroot open files limit issue on centos:
echo "* hard nofile 16384" | sudo tee --append /etc/security/limits.conf
} 2>&1 | tee machine-launchstatus.log
chmod ugo+r machine-launchstatus.log

View File

@ -1,6 +1,37 @@
# you should source this only if you plan to run build/simulations locally,
# without using the manager at all.
# Whether to run the ssh-agent setup step after environment setup.
# Disabled by passing --skip-ssh-setup (useful when ssh-agent is pre-configured).
DO_SSH_SETUP=true

# Print usage for the options this sourceme script accepts.
function usage
{
    echo "usage: source sourceme-f1-full.sh [OPTIONS]"
    echo "options:"
    echo " --skip-ssh-setup: if set, skips ssh setup checks."
}

# Parse arguments. Only --skip-ssh-setup and the help flags are accepted;
# anything else prints usage and exits with a non-zero status.
# NOTE(review): this script is intended to be sourced, and `exit` in a
# sourced script terminates the *calling* shell -- confirm this is intended
# for the help/error paths.
while test $# -gt 0
do
    case "$1" in
        --skip-ssh-setup)
            DO_SSH_SETUP=false;
            ;;
        -h | -H | --help)
            usage
            exit
            ;;
        --*) echo "ERROR: bad option $1"
            usage
            exit 1
            ;;
        *) echo "ERROR: bad argument $1"
            usage
            exit 2
            ;;
    esac
    shift
done
unamestr=$(uname)
RDIR=$(pwd)
AWSFPGA=$RDIR/platforms/f1/aws-fpga
@ -18,8 +49,10 @@ cd $RDIR
# put the manager on the user path
export PATH=$PATH:$(pwd)/deploy
# setup ssh-agent
source deploy/ssh-setup.sh
if [ "$DO_SSH_SETUP" = true ]; then
# setup ssh-agent
source deploy/ssh-setup.sh
fi
# flag for scripts to check that this has been sourced
export FIRESIM_SOURCED=1

View File

@ -2,6 +2,37 @@
# you can also source it in your bashrc, but you must cd to this directory
# first
# Whether to run the ssh-agent setup step after environment setup.
# Disabled by passing --skip-ssh-setup (useful when ssh-agent is pre-configured).
DO_SSH_SETUP=true

# Print usage for the options this sourceme script accepts.
function usage
{
    echo "usage: source sourceme-f1-manager.sh [OPTIONS]"
    echo "options:"
    echo " --skip-ssh-setup: if set, skips ssh setup checks."
}

# Parse arguments. Only --skip-ssh-setup and the help flags are accepted;
# anything else prints usage and exits with a non-zero status.
# NOTE(review): this script is intended to be sourced, and `exit` in a
# sourced script terminates the *calling* shell -- confirm this is intended
# for the help/error paths.
while test $# -gt 0
do
    case "$1" in
        --skip-ssh-setup)
            DO_SSH_SETUP=false;
            ;;
        -h | -H | --help)
            usage
            exit
            ;;
        --*) echo "ERROR: bad option $1"
            usage
            exit 1
            ;;
        *) echo "ERROR: bad argument $1"
            usage
            exit 2
            ;;
    esac
    shift
done
unamestr=$(uname)
RDIR=$(pwd)
AWSFPGA=$RDIR/platforms/f1/aws-fpga
@ -13,8 +44,10 @@ source ./env.sh
# put the manager on the user path
export PATH=$PATH:$(pwd)/deploy
# setup ssh-agent
source deploy/ssh-setup.sh
if [ "$DO_SSH_SETUP" = true ]; then
# setup ssh-agent
source deploy/ssh-setup.sh
fi
# flag for scripts to check that this has been sourced
export FIRESIM_SOURCED=1

@ -1 +1 @@
Subproject commit 2d03d10837fc1a7569d10ae2c91c8132c7fee77d
Subproject commit 640d159499d16a388b6dc2d7277be28365a2a536