firesim/.github/scripts/run-linux-poweroff-external...

110 lines
5.5 KiB
Python
Executable File

#!/usr/bin/env python3
import sys
import time
from pathlib import Path
from fabric.api import prefix, settings, run, execute # type: ignore
from common import manager_fsim_dir, set_fabric_firesim_pem
from utils import search_match_in_last_workloads_output_file
from ci_variables import ci_env
sys.path.append(ci_env['GITHUB_WORKSPACE'] + "/deploy")
from awstools.awstools import get_instances_with_filter, get_private_ips_for_instances
from util.filelineswap import file_line_swap
def run_linux_poweroff_externally_provisioned():
    """ Runs Linux poweroff workloads using externally provisioned AWS run farm

    An AWS EC2 run farm is launched first. The private IPs of its running
    instances are then written into a copy of the workload config that points
    at the externally provisioned run farm recipe, and the workload is run
    against that copy. The EC2 run farm is always terminated afterwards.
    """

    with prefix(f"cd {manager_fsim_dir} && source sourceme-manager.sh"):

        def run_w_timeout(workload_path, workload, timeout, num_passes):
            """ Run workload with a specific timeout

            :arg: workload_path (str) - workload abs path
            :arg: workload (str) - workload yaml name
            :arg: timeout (str) - timeout amount for the workload to run
            :arg: num_passes (int) - count that
                search_match_in_last_workloads_output_file must report for
                every expected marker string

            Exits the process with the workload's return code when it is
            non-zero; raises AssertionError when a marker count differs from
            num_passes.
            """
            workload_full = workload_path + "/" + workload
            log_tail_length = 300
            script_name = Path(__file__).stem
            rf_prefix = f"{ci_env['GITHUB_RUN_ID']}-{script_name}"

            # unique tag based on the ci workflow and filename is needed to ensure
            # run farm is unique to each linux-poweroff test
            with prefix(f"export FIRESIM_RUNFARM_PREFIX={rf_prefix}"):
                rc = 0
                with settings(warn_only=True):
                    # do the following:
                    #   1. launch the run farm w/ the AWS EC2 runfarm
                    #   2. copy the hostnames given into a new externally provisioned runfarm/runtime file
                    #   3. run launch/infra/runworkload/terminate w/ that runtime
                    #   4. if successful or fail, run the terminate w/ the old AWS EC2 runfarm
                    #
                    # warn_only only covers failing shell commands; the
                    # try/finally additionally guarantees step 4 when one of
                    # the Python helpers below raises, so the EC2 instances
                    # are never left running.
                    try:
                        # The launch result is intentionally not stored: only
                        # the workload's return code decides pass/fail below.
                        run(f"firesim launchrunfarm -c {workload_full}")

                        time.sleep(3 * 60) # TODO: replace w/ instance_liveness check

                        instances_filter = [
                            {'Name': 'instance-type', 'Values': ['f1.2xlarge']},
                            {'Name': 'tag:fsimcluster', 'Values': [f'{rf_prefix}*']},
                        ]
                        instances = get_instances_with_filter(instances_filter, allowed_states=["running"])
                        instance_ips = [instance['PrivateIpAddress'] for instance in instances]

                        # NOTE(review): these lines replace the text between the
                        # "ci replace start"/"ci replace end" markers of the
                        # workload yaml; confirm their leading whitespace
                        # matches the nesting expected at that location.
                        start_lines = [f" base_recipe: run-farm-recipes/externally_provisioned.yaml\n"]
                        start_lines += [" recipe_arg_overrides:\n"]
                        start_lines += [" run_farm_hosts_to_use:\n"]
                        start_lines += [f""" - "centos@{ip}": one_fpga_spec\n""" for ip in instance_ips]

                        file_line_swap(
                            workload_full,
                            f"/tmp/{workload}",
                            "ci replace start",
                            "ci replace end",
                            start_lines)

                        # avoid logging excessive amounts to prevent GH-A masking secrets (which slows down log output)
                        # pty=False needed to avoid issues with screen -ls stalling in fabric
                        rc = run(f"timeout {timeout} ./deploy/workloads/run-workload.sh /tmp/{workload} --withlaunch &> {workload}.log", pty=False).return_code
                        print(f" Printing last {log_tail_length} lines of log. See {workload}.log for full info.")
                        run(f"tail -n {log_tail_length} {workload}.log")

                        # This is a janky solution to the fact the manager does not
                        # return a non-zero exit code or some sort of result summary.
                        # The expectation here is that the PR author will manually
                        # check these output files for correctness until it can be
                        # done programmatically..
                        print(f"Printing last {log_tail_length} lines of all output files. See results-workload for more info.")
                        run(f"""cd deploy/results-workload/ && LAST_DIR=$(ls | tail -n1) && if [ -d "$LAST_DIR" ]; then tail -n{log_tail_length} $LAST_DIR/*/*; fi""")
                    finally:
                        run(f"firesim terminaterunfarm -q -c {workload_full}")

                if rc != 0:
                    print(f"Workload {workload} failed.")
                    sys.exit(rc)
                else:
                    print(f"Workload run {workload} successful. Checking workload files...")

                    def check(match_key, file_name = 'uartlog'):
                        """ Assert that match_key is counted num_passes times for file_name """
                        out_count = search_match_in_last_workloads_output_file(file_name, match_key)
                        assert out_count == num_passes, f"Workload {file_name} files are malformed: '{match_key}' found {out_count} times (!= {num_passes}). Something went wrong."

                    # first driver completed successfully
                    check('*** PASSED ***')

                    # verify login was reached (i.e. linux booted)
                    check('running /etc/init.d/S99run')

                    # verify reaching poweroff
                    check('Power down')

                    print(f"Workload run {workload} successful.")

        run_w_timeout(f"{manager_fsim_dir}/deploy/workloads", "linux-poweroff-all-no-nic.yaml", "45m", 2)
if __name__ == "__main__":
    # Script entry point: call the project's Fabric PEM setup helper first
    # (presumably selects the SSH key Fabric will use -- confirm in common.py),
    # then run the test task through Fabric against localhost.
    set_fabric_firesim_pem()
    execute(
        run_linux_poweroff_externally_provisioned,
        hosts=["localhost"],
    )