diff --git a/.github/scripts/run-parallel-metasims.py b/.github/scripts/run-parallel-metasims.py
new file mode 100755
index 00000000..c9f8b28b
--- /dev/null
+++ b/.github/scripts/run-parallel-metasims.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python3
+
+import sys
+from pathlib import Path
+
+from fabric.api import prefix, settings, run, execute # type: ignore
+
+from common import manager_fsim_dir, set_fabric_firesim_pem
+from ci_variables import ci_env
+
+def run_parallel_metasim():
+    """ Build and install the baremetal hello-world workload, then run it as parallel metasimulations on localhost and on AWS EC2 """
+
+    with prefix(f"cd {manager_fsim_dir} && source sourceme-f1-manager.sh"):
+
+        # build the hello world baremetal test; the full build log is only printed if the build fails
+        with prefix('cd sw/firesim-software'):
+            with settings(warn_only=True):  # warn instead of aborting so the return code can be inspected
+                rc = run("./marshal -v build test/bare.yaml &> bare.full.log").return_code
+                if rc != 0:
+                    run("cat bare.full.log")
+                    raise Exception("Building test/bare.yaml failed to run")
+
+            run("./marshal -v install test/bare.yaml")
+
+        def run_w_timeout(workload: str, timeout: str):
+            """ Run a workload under `timeout`, print the tail of its logs, and exit the script on failure
+
+            :arg: workload (str) - workload yaml (abs path); command output is written to "<workload>.log"
+            :arg: timeout (str) - duration passed to the `timeout` command (e.g. "15m")
+            """
+            log_tail_length = 100
+            # unique tag based on the ci workflow run id and this script's filename is
+            # needed to ensure the run farm is unique to this metasim test
+            script_name = Path(__file__).stem
+            with prefix(f"export FIRESIM_RUNFARM_PREFIX={ci_env['GITHUB_RUN_ID']}-{script_name}"):
+                rc = 0
+                with settings(warn_only=True):  # keep going on failure so the logs below are still printed
+                    # avoid logging excessive amounts to prevent GH-A masking secrets (which slows down log output)
+                    # pty=False needed to avoid issues with screen -ls stalling in fabric
+                    rc = run(f"timeout {timeout} ./deploy/workloads/run-workload.sh {workload} --withlaunch &> {workload}.log", pty=False).return_code
+                    print(f"Printing last {log_tail_length} lines of log. See {workload}.log for full info.")
+                    run(f"tail -n {log_tail_length} {workload}.log")
+
+                    # This is a janky solution to the fact the manager does not
+                    # return a non-zero exit code or some sort of result summary.
+                    # The expectation here is that the PR author will manually
+                    # check these output files for correctness until it can be
+                    # done programmatically.
+                    print(f"Printing last {log_tail_length} lines of all output files. See results-workload for more info.")
+                    run(f"""cd deploy/results-workload/ && LAST_DIR=$(ls | tail -n1) && if [ -d "$LAST_DIR" ]; then tail -n{log_tail_length} $LAST_DIR/*/*; fi""")
+
+                if rc != 0:
+                    # need to confirm that instance is off: terminate the run farm before exiting
+                    print(f"Workload {workload} failed. Terminating runfarm.")
+                    run(f"firesim terminaterunfarm -q -c {workload}")
+                    sys.exit(rc)  # propagate the workload's (or timeout's) exit code
+                else:
+                    print(f"Workload {workload} successful.")
+
+        run_w_timeout(f"{manager_fsim_dir}/deploy/workloads/hello-world-localhost-metasim.yaml", "15m")
+        run_w_timeout(f"{manager_fsim_dir}/deploy/workloads/hello-world-awsec2-metasim.yaml", "15m")
+
+if __name__ == "__main__":
+    set_fabric_firesim_pem()
+    execute(run_parallel_metasim, hosts=["localhost"])
diff --git a/.github/workflows/firesim-run-tests.yml b/.github/workflows/firesim-run-tests.yml index bf0cadfd..3e50bc5e 100644 --- a/.github/workflows/firesim-run-tests.yml +++ b/.github/workflows/firesim-run-tests.yml @@ -297,6 +297,19 @@ jobs: test-package: "firesim.firesim" test-name: "CITests" + run-parallel-metasims: + name: run-parallel-metasims + # Building the driver can cause concurrency issues with SBT, so serialize + # this behind the scalatest train. Remove once we're off SBT. 
+ needs: [run-chipyard-tests] + runs-on: aws-${{ github.run_id }} + env: + TERM: xterm-256-color + steps: + - uses: actions/checkout@v3 + - name: Run parallel metasimulation tests (deploy on localhost and on AWS instances) + run: .github/scripts/run-parallel-metasims.py + run-basic-linux-poweroff: if: contains(github.event.pull_request.labels.*.name, 'ci:fpga-deploy') name: run-basic-linux-poweroff diff --git a/deploy/runtools/runtime_config.py b/deploy/runtools/runtime_config.py index 063ee407..1057bd77 100644 --- a/deploy/runtools/runtime_config.py +++ b/deploy/runtools/runtime_config.py @@ -289,9 +289,9 @@ class RuntimeHWConfig: self.driver_built = True def build_sim_tarball(self, paths: List[Tuple[str, str]], tarball_name: str) -> None: - """ Take the simulation driver and tar it. build_sim_driver() - must run before this function. Rsync is used in a mode where it's copying - from local paths to a local folder. This is confusing as rsync traditionaly is + """ Take the simulation driver and tar it. build_sim_driver() + must run before this function. Rsync is used in a mode where it's copying + from local paths to a local folder. This is confusing as rsync traditionally is used for copying from local folders to a remote folder. 
The variable local_remote_dir is named as a reminder that it's actually pointing at this local machine""" if self.tarball_built: @@ -300,7 +300,7 @@ class RuntimeHWConfig: # builddir is a temporary directory created by TemporaryDirectory() # the path a folder is under /tmp/ with a random name - # After this scope block exists, the entier folder is deleted + # After this scope block exits, the entire folder is deleted with TemporaryDirectory() as builddir: with InfoStreamLogger('stdout'), prefix(f'cd {get_deploy_dir()}'): @@ -317,6 +317,9 @@ class RuntimeHWConfig: self.handle_failure(results, 'local rsync', get_deploy_dir(), cmd) # This must be taken outside of a cd context + cmd = f"mkdir -p {self.local_triplet_path()}" + results = run(cmd) + self.handle_failure(results, 'local mkdir', builddir, cmd) absolute_tarball_path = self.local_triplet_path() / tarball_name with InfoStreamLogger('stdout'), prefix(f'cd {builddir}'): @@ -348,8 +351,12 @@ class RuntimeBuildRecipeConfig(RuntimeHWConfig): metasimulation_only_plusargs: str, metasimulation_only_vcs_plusargs: str) -> None: self.name = name + self.agfi = None self.xclbin = None + self.driver_tar = None + self.tarball_built = False + self.deploytriplet = build_recipe_dict['DESIGN'] + "-" + build_recipe_dict['TARGET_CONFIG'] + "-" + build_recipe_dict['PLATFORM_CONFIG'] self.customruntimeconfig = build_recipe_dict['metasim_customruntimeconfig'] diff --git a/deploy/workloads/hello-world-awsec2-metasim.yaml b/deploy/workloads/hello-world-awsec2-metasim.yaml new file mode 100644 index 00000000..b051f5b7 --- /dev/null +++ b/deploy/workloads/hello-world-awsec2-metasim.yaml @@ -0,0 +1,46 @@ +run_farm: + base_recipe: run-farm-recipes/aws_ec2.yaml + recipe_arg_overrides: + run_farm_tag: helloworldawsec2 + run_farm_hosts_to_use: + - z1d.3xlarge: 4 + +metasimulation: + metasimulation_enabled: true + metasimulation_host_simulator: verilator + metasimulation_only_plusargs: "+fesvr-step-size=128 +max-cycles=100000000" + 
metasimulation_only_vcs_plusargs: "+vcs+initreg+0 +vcs+initmem+0" + +target_config: + topology: no_net_config + no_net_num_nodes: 4 + link_latency: 6405 + switching_latency: 10 + net_bandwidth: 200 + profile_interval: -1 + default_hw_config: firesim_rocket_quadcore_no_nic_l2_llc4mb_ddr3 + plusarg_passthrough: "" + +tracing: + enable: no + output_format: 0 + selector: 1 + start: 0 + end: -1 + +autocounter: + read_rate: 0 + +workload: + workload_name: bare.json + terminate_on_completion: yes + suffix_tag: null + +host_debug: + zero_out_dram: no + disable_synth_asserts: no + +synth_print: + start: 0 + end: -1 + cycle_prefix: yes diff --git a/deploy/workloads/hello-world-localhost-metasim.yaml b/deploy/workloads/hello-world-localhost-metasim.yaml new file mode 100644 index 00000000..9ea57a92 --- /dev/null +++ b/deploy/workloads/hello-world-localhost-metasim.yaml @@ -0,0 +1,48 @@ +run_farm: + base_recipe: run-farm-recipes/externally_provisioned.yaml + recipe_arg_overrides: + run_farm_tag: helloworldlocalhost + default_platform: EC2InstanceDeployManager + default_simulation_dir: /home/centos + run_farm_hosts_to_use: + - localhost: four_metasims_spec + +metasimulation: + metasimulation_enabled: true + metasimulation_host_simulator: verilator + metasimulation_only_plusargs: "+fesvr-step-size=128 +max-cycles=100000000" + metasimulation_only_vcs_plusargs: "+vcs+initreg+0 +vcs+initmem+0" + +target_config: + topology: no_net_config + no_net_num_nodes: 4 + link_latency: 6405 + switching_latency: 10 + net_bandwidth: 200 + profile_interval: -1 + default_hw_config: firesim_rocket_quadcore_no_nic_l2_llc4mb_ddr3 + plusarg_passthrough: "" + +tracing: + enable: no + output_format: 0 + selector: 1 + start: 0 + end: -1 + +autocounter: + read_rate: 0 + +workload: + workload_name: bare.json + terminate_on_completion: yes + suffix_tag: null + +host_debug: + zero_out_dram: no + disable_synth_asserts: no + +synth_print: + start: 0 + end: -1 + cycle_prefix: yes