Move VCS metasims to run on on-premise FPGA machine

This commit is contained in:
abejgonzalez 2023-01-19 21:45:02 -08:00 committed by Mergify
parent c43df3f3ab
commit 7a3349901f
3 changed files with 95 additions and 5 deletions

75
.github/scripts/run-parallel-vcs-metasims.py vendored Executable file
View File

@ -0,0 +1,75 @@
#!/usr/bin/env python3
import sys
from pathlib import Path
from fabric.api import prefix, run, settings, execute # type: ignore
from common import manager_fsim_dir, set_fabric_firesim_pem
from ci_variables import ci_env
def run_parallel_metasim():
    """Build the bare-metal test workload and run the VCS hello-world metasimulation.

    Every command is issued through Fabric's ``run`` from inside the checkout
    at ``ci_env['GITHUB_WORKSPACE']``.

    Assumptions:
      - machine-launch-script requirements are already installed
      - VCS is installed
      - the repo is already checked out

    Raises:
        Exception: if ``./marshal -v build test/bare.yaml`` returns non-zero.
        SystemExit: with the workload's return code if the workload command
            fails (including being stopped by ``timeout``).
    """
    with prefix(f"cd {ci_env['GITHUB_WORKSPACE']}"):
        run("./build-setup.sh --skip-validate")

        with prefix('source sourceme-f1-manager.sh --skip-ssh-setup'):
            with prefix('cd sw/firesim-software'):
                run("./init-submodules.sh")

                # build hello world baremetal test
                with settings(warn_only=True):
                    # avoid logging excessive amounts to prevent GH-A masking secrets
                    # (which slows down log output): the build output goes to a file
                    # and is only printed when the build fails
                    build_rc = run("./marshal -v build test/bare.yaml &> bare.full.log").return_code
                    if build_rc != 0:
                        run("cat bare.full.log")
                        raise Exception("Building test/bare.yaml failed to run")

                run("./marshal -v install test/bare.yaml")

            def run_w_timeout(workload: str, timeout: str):
                """ Run workload with a specific timeout

                On failure the run farm is terminated and the script exits with
                the workload's return code.

                :arg: workload (str) - workload yaml (abs path)
                :arg: timeout (str) - duration handed to the `timeout` command (e.g. "15m")
                """
                log_tail_length = 100

                # unique tag based on the ci workflow and filename is needed to ensure
                # run farm is unique to each CI script
                script_name = Path(__file__).stem

                def print_logs():
                    """ Print the tail of the workload log and of the latest results directory """
                    print(f"Printing last {log_tail_length} lines of log. See {workload}.log for full info.")
                    run(f"tail -n {log_tail_length} {workload}.log")

                    # This is a janky solution to the fact the manager does not
                    # return a non-zero exit code or some sort of result summary.
                    # The expectation here is that the PR author will manually
                    # check these output files for correctness until it can be
                    # done programmatically..
                    print(f"Printing last {log_tail_length} lines of all output files. See results-workload for more info.")
                    run(f"""cd deploy/results-workload/ && LAST_DIR=$(ls | tail -n1) && if [ -d "$LAST_DIR" ]; then tail -n{log_tail_length} $LAST_DIR/*/*; fi""")

                with prefix(f"export FIRESIM_RUNFARM_PREFIX={ci_env['GITHUB_RUN_ID']}-{script_name}"):
                    with settings(warn_only=True):
                        # avoid logging excessive amounts to prevent GH-A masking secrets (which slows down log output)
                        # pty=False needed to avoid issues with screen -ls stalling in fabric
                        rc = run(f"timeout {timeout} ./deploy/workloads/run-workload.sh {workload} --withlaunch &> {workload}.log", pty=False).return_code

                    if rc == 0:
                        print_logs()
                        print(f"Workload {workload} successful.")
                        return

                    # The workload failed: printing logs is best-effort here so that
                    # a missing log or results directory cannot abort the script
                    # before the run farm is terminated.
                    with settings(warn_only=True):
                        print_logs()

                    # need to confirm that instance is off
                    print(f"Workload {workload} failed. Terminating runfarm.")
                    run(f"firesim terminaterunfarm -q -c {workload}")
                    sys.exit(rc)

            # NOTE(review): the workload path is rooted at manager_fsim_dir while the
            # commands above run from GITHUB_WORKSPACE; confirm both refer to the
            # same checkout on the machine this script runs on.
            run_w_timeout(f"{manager_fsim_dir}/deploy/workloads/hello-world-localhost-vcs-metasim.yaml", "15m")
if __name__ == "__main__":
    # Hand the task to Fabric, targeting only the machine this script runs on.
    target_hosts = ["localhost"]
    execute(run_parallel_metasim, hosts=target_hosts)

View File

@ -59,7 +59,6 @@ def run_parallel_metasim():
print(f"Workload {workload} successful.")
run_w_timeout(f"{manager_fsim_dir}/deploy/workloads/hello-world-localhost-verilator-metasim.yaml", "15m")
run_w_timeout(f"{manager_fsim_dir}/deploy/workloads/hello-world-localhost-vcs-metasim.yaml", "15m")
run_w_timeout(f"{manager_fsim_dir}/deploy/workloads/hello-world-awsec2-verilator-metasim.yaml", "15m")
if __name__ == "__main__":

View File

@ -297,8 +297,8 @@ jobs:
test-package: "firesim.firesim"
test-name: "CITests"
run-parallel-metasims:
name: run-parallel-metasims
run-parallel-verilator-metasims:
name: run-parallel-verilator-metasims
# Building the driver can cause concurrency issues with SBT, so serialize
# this behind the scalatest train. Remove once we're off SBT.
needs: [run-chipyard-tests]
@ -307,8 +307,8 @@ jobs:
TERM: xterm-256-color
steps:
- uses: actions/checkout@v3
- name: Run parallel metasimulation tests (deploy on localhost and on AWS instances)
run: .github/scripts/run-parallel-metasims.py
- name: Run parallel Verilator metasimulation tests (deploy on localhost and on AWS instances)
run: .github/scripts/run-parallel-verilator-metasims.py
run-basic-linux-poweroff:
if: contains(github.event.pull_request.labels.*.name, 'ci:fpga-deploy')
@ -336,6 +336,22 @@ jobs:
- name: Run linux-poweroff test w/ externally provisioned (AWS EC2) run farm
run: .github/scripts/run-linux-poweroff-externally-provisioned.py
# Runs .github/scripts/run-parallel-vcs-metasims.py on a runner labelled `local-fpga`.
run-parallel-vcs-metasims:
name: run-parallel-vcs-metasims
runs-on: local-fpga
env:
# NOTE(review): the common terminfo entry is spelled `xterm-256color`; confirm this spelling is intended.
TERM: xterm-256-color
steps:
# This forces a fresh clone of the repo during the `checkout` step
# to resolve stale submodule URLs. See https://github.com/ucb-bar/chipyard/pull/1156.
# Two `rm` commands are used because `*` does not match dotfiles by default;
# `|| true` keeps the step from failing when either `rm` returns non-zero.
- name: Delete old checkout
run: |
rm -rf ${{ github.workspace }}/* || true
rm -rf ${{ github.workspace }}/.* || true
- uses: actions/checkout@v3
- name: Run parallel VCS metasims
run: .github/scripts/run-parallel-vcs-metasims.py
run-basic-linux-poweroff-vitis:
if: contains(github.event.pull_request.labels.*.name, 'ci:fpga-deploy')
name: run-basic-linux-poweroff-vitis