Merge pull request #1460 from firesim/al2

Support AL2 manager instances
This commit is contained in:
Abraham Gonzalez 2023-03-11 14:11:39 -08:00 committed by GitHub
commit 26bb89d5f5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 46 additions and 33 deletions

View File

@ -17,9 +17,9 @@ from ci_variables import ci_env
# Wait for the machine launch script on the remote host to report completion.
#
# Runs a remote command that, for up to 20 minutes (`timeout 20m`), uses
# `grep -q` on the output of `tail -f` of the launch status file, looking for
# the text "machine launch script complete". If that command returns a
# non-zero code, the launch log is printed and an Exception is raised.
#
# NOTE(review): the paired `run(...)` lines below differ only in the `/` vs
# `/tmp/` path prefix and look like the before/after versions of a single
# statement -- confirm which one is live before relying on this text.
def wait_machine_launch_complete():
# Catch any exception that occurs so that we can gracefully teardown
with settings(warn_only=True):
rc = run("timeout 20m grep -q '.*machine launch script complete.*' <(tail -f /machine-launchstatus)").return_code
rc = run("timeout 20m grep -q '.*machine launch script complete.*' <(tail -f /tmp/machine-launchstatus)").return_code
# a non-zero return code means the completion text was not seen
# (presumably a timeout or an unreadable status file -- verify)
if rc != 0:
# print the launch log so the failure can be diagnosed from the output
run("cat /machine-launchstatus.log")
run("cat /tmp/machine-launchstatus.log")
raise Exception("machine-launch-script.sh failed to run")
def setup_self_hosted_runners(platform_lib: PlatformLib):

View File

@ -39,7 +39,16 @@ rootLogger = logging.getLogger()
# https://console.aws.amazon.com/ec2/v2/home?region=us-east-1#Images:visibility=public-images;search=FPGA%20Developer;sort=name
# And whenever this changes, you also need to update deploy/tests/test_amis.json
# by running scripts/update_test_amis.py
f1_ami_name = "FPGA Developer AMI - 1.12.1-40257ab5-6688-4c95-97d1-e251a40fd1fc"
# Additionally, for normal use this assumes that the AMIs used by the run hosts and the manager instance match.
# In the case of CI (or when launching instances from a non-EC2 machine), this defaults to the CentOS-based AMI.
def get_f1_ami_name() -> str:
    """Return the name of the FPGA Developer AMI to look up.

    The choice is keyed off the ``USER`` environment variable:

    * ``amzn``   -> the AL2 FPGA Developer AMI
    * ``centos`` -> the CentOS FPGA Developer AMI
    * anything else, or ``USER`` not set at all -> the CentOS AMI, after
      printing a notice that the default is being used

    Returns:
        The AMI name string.
    """
    # Use .get() rather than indexing: USER is not guaranteed to be set, and
    # the intended behaviour for unrecognised environments is to fall back to
    # the CentOS AMI, not to raise KeyError.
    cuser = os.environ.get("USER")

    if cuser == "amzn":
        return "FPGA Developer AMI(AL2) - 1.11.3-62ddb7b2-2f1e-4c38-a111-9093dcb1656f"

    if cuser != "centos":
        print("Unknown $USER (expected centos/amzn). Defaulting to the Centos AWS EC2 AMI.")
    return "FPGA Developer AMI - 1.12.1-40257ab5-6688-4c95-97d1-e251a40fd1fc"
class MockBoto3Instance:
""" This is used for testing without actually launching instances. """
@ -227,7 +236,7 @@ def get_f1_ami_id() -> str:
""" Get the AWS F1 Developer AMI by looking up the image name -- should be region independent.
"""
client = boto3.client('ec2')
response = client.describe_images(Filters=[{'Name': 'name', 'Values': [f1_ami_name]}])
response = client.describe_images(Filters=[{'Name': 'name', 'Values': [get_f1_ami_name()]}])
assert len(response['Images']) == 1
return response['Images'][0]['ImageId']

View File

@ -160,7 +160,7 @@ class RunFarm(metaclass=abc.ABCMeta):
def __init__(self, args: Dict[str, Any], metasimulation_enabled: bool) -> None:
self.args = args
self.metasimulation_enabled = metasimulation_enabled
self.default_simulation_dir = self.args.get("default_simulation_dir", "/home/centos")
self.default_simulation_dir = self.args.get("default_simulation_dir", f"/home/{os.environ['USER']}")
self.SIM_HOST_HANDLE_TO_MAX_FPGA_SLOTS = dict()
self.SIM_HOST_HANDLE_TO_MAX_METASIM_SLOTS = dict()
self.SIM_HOST_HANDLE_TO_SWITCH_ONLY_OK = dict()

View File

@ -12,6 +12,7 @@ import time
from os.path import join as pjoin
from os.path import basename, expanduser
from os import PathLike, fspath
import os
from fsspec.core import url_to_fs # type: ignore
from pathlib import Path
import hashlib
@ -80,14 +81,14 @@ class URIContainer:
return m.hexdigest()
def _resolve_vanilla_path(self, hwcfg) -> Optional[str]:
""" Allows fallback to a vanilla path. Relative paths are resolved realtive to firesim/deploy/.
""" Allows fallback to a vanilla path. Relative paths are resolved realtive to firesim/deploy/.
This will convert a vanilla path to a URI, or return None."""
uri: Optional[str] = getattr(hwcfg, self.hwcfg_prop)
# do nothing if there isn't a URI
if uri is None:
return None
# if already a URI, exit early returning unmodified string
is_uri = re.match(_RFC_3986_PATTERN, uri)
if is_uri:
@ -116,7 +117,7 @@ class URIContainer:
# choose a repeatable, path based on the hash of the URI
destination = pjoin(local_dir, self.hashed_name(uri))
return (uri, destination)
def local_pre_download(self, local_dir: str, hwcfg) -> Optional[Tuple[str, str]]:
@ -126,11 +127,11 @@ class URIContainer:
# resolve the URI and the path '/{dir}/{hash}' we should download to
both = self._choose_path(local_dir, hwcfg)
# do nothing if there isn't a URI
if both is None:
return None
(uri, destination) = both
# When it exists, return the same information, but skip the download
@ -152,11 +153,11 @@ class URIContainer:
# resolve the URI and the path '/{dir}/{hash}' we should download to
both = self._choose_path(local_dir, hwcfg)
# do nothing if there isn't a URI
if both is None:
return None
(uri, destination) = both
# because the local file has a nonsense name (the hash)
@ -237,7 +238,7 @@ class InstanceDeployManager(metaclass=abc.ABCMeta):
### XXX Centos Specific
run('sudo yum -y install qemu-img')
# copy over kernel module
put('../build/nbd.ko', '/home/centos/nbd.ko', mirror_local_mode=True)
put('../build/nbd.ko', f"/home/{os.environ['USER']}/nbd.ko", mirror_local_mode=True)
def load_nbd_module(self) -> None:
""" If NBD is available and qcow2 support is required, load the nbd
@ -246,7 +247,7 @@ class InstanceDeployManager(metaclass=abc.ABCMeta):
if self.nbd_tracker is not None and self.parent_node.qcow2_support_required():
self.instance_logger("Loading NBD Kernel Module.")
self.unload_nbd_module()
run("""sudo insmod /home/centos/nbd.ko nbds_max={}""".format(self.nbd_tracker.NBDS_MAX))
run(f"""sudo insmod /home/{os.environ['USER']}/nbd.ko nbds_max={self.nbd_tracker.NBDS_MAX}""")
def unload_nbd_module(self) -> None:
""" If NBD is available and qcow2 support is required, unload the nbd
@ -278,7 +279,7 @@ class InstanceDeployManager(metaclass=abc.ABCMeta):
""" Returns the path on the remote for a given slot number. """
remote_home_dir = self.parent_node.get_sim_dir()
remote_sim_dir = f"{remote_home_dir}/sim_slot_{slotno}/"
# so that callers can reliably concatenate folders to the returned value
assert remote_sim_dir[-1] == '/', f"Return value of get_remote_sim_dir_for_slot({slotno}) must end with '/'."
@ -294,12 +295,12 @@ class InstanceDeployManager(metaclass=abc.ABCMeta):
hwcfg = self.parent_node.sim_slots[slotno].get_resolved_server_hardware_config()
for container in self.uri_list:
container.local_pre_download(dir, hwcfg)
def get_local_uri_paths(self, slotno: int, dir: str) -> list[Tuple[str, str]]:
""" Get all paths of local URIs that were previously downloaded. """
hwcfg = self.parent_node.sim_slots[slotno].get_resolved_server_hardware_config()
ret = list()
for container in self.uri_list:
maybe_file = container.get_rsync_path(dir, hwcfg)
@ -637,7 +638,7 @@ class EC2InstanceDeployManager(InstanceDeployManager):
with warn_only():
run('git clone https://github.com/aws/aws-fpga')
run('cd aws-fpga && git checkout ' + aws_fpga_upstream_version)
with cd('/home/centos/aws-fpga'):
with cd(f"/home/{os.environ['USER']}/aws-fpga"):
run('source sdk_setup.sh')
def fpga_node_xdma(self) -> None:
@ -646,10 +647,10 @@ class EC2InstanceDeployManager(InstanceDeployManager):
"""
if self.instance_assigned_simulations():
self.instance_logger("""Copying AWS FPGA XDMA driver to remote node.""")
run('mkdir -p /home/centos/xdma/')
run(f"mkdir -p /home/{os.environ['USER']}/xdma/")
put('../platforms/f1/aws-fpga/sdk/linux_kernel_drivers',
'/home/centos/xdma/', mirror_local_mode=True)
with cd('/home/centos/xdma/linux_kernel_drivers/xdma/'), \
f"/home/{os.environ['USER']}/xdma/", mirror_local_mode=True)
with cd(f"/home/{os.environ['USER']}/xdma/linux_kernel_drivers/xdma/"), \
prefix("export PATH=/usr/bin:$PATH"):
# prefix only needed if conda env is earlier in PATH
# see build-setup-nolog.sh for explanation.
@ -738,7 +739,7 @@ class EC2InstanceDeployManager(InstanceDeployManager):
# now load xdma
self.instance_logger("Loading XDMA Driver Kernel Module.")
# TODO: can make these values automatically be chosen based on link lat
run("sudo insmod /home/centos/xdma/linux_kernel_drivers/xdma/xdma.ko poll_mode=1")
run(f"sudo insmod /home/{os.environ['USER']}/xdma/linux_kernel_drivers/xdma/xdma.ko poll_mode=1")
def start_ila_server(self) -> None:
""" start the vivado hw_server and virtual jtag on simulation instance. """
@ -807,12 +808,12 @@ class EC2InstanceDeployManager(InstanceDeployManager):
class VitisInstanceDeployManager(InstanceDeployManager):
""" This class manages a Vitis-enabled instance """
@classmethod
def sim_command_requires_sudo(cls) -> bool:
""" This sim does not require sudo. """
return False
@classmethod
def get_xclbin_filename(cls) -> str:
""" Get the name of the xclbin inside the sim_slot_X directory on the run host. """

View File

@ -84,11 +84,11 @@ before, so we need to first ssh into the instance and make sure the
setup is complete.
In either case, ``ssh`` into your instance (e.g. ``ssh -i firesim.pem centos@YOUR_INSTANCE_IP``) and wait until the
``/machine-launchstatus`` file contains all the following text:
``/tmp/machine-launchstatus`` file contains all the following text:
::
$ cat /machine-launchstatus
$ cat /tmp/machine-launchstatus
machine launch script started
machine launch script completed

View File

@ -27,7 +27,7 @@ $SCRIPT_DIR/../../deploy/awstools/awstools.py \
rm -rf machine-launch-script.sh
# make sure managerinit finishes properly
run "timeout 10m grep -q \".*machine launch script complete.*\" <(tail -f machine-launchstatus)"
run "timeout 10m grep -q \".*machine launch script complete.*\" <(tail -f /tmp/machine-launchstatus)"
# setup the repo (similar to ci)

View File

@ -1,5 +1,7 @@
#!/bin/bash
MACHINE_LAUNCH_DIR=/tmp
CONDA_INSTALL_PREFIX=/opt/conda
CONDA_INSTALLER_VERSION=4.12.0-0
CONDA_INSTALLER="https://github.com/conda-forge/miniforge/releases/download/${CONDA_INSTALLER_VERSION}/Miniforge3-${CONDA_INSTALLER_VERSION}-Linux-x86_64.sh"
@ -108,8 +110,8 @@ set -o pipefail
OS_FLAVOR=$(grep '^ID=' /etc/os-release | awk -F= '{print $2}' | tr -d '"')
OS_VERSION=$(grep '^VERSION_ID=' /etc/os-release | awk -F= '{print $2}' | tr -d '"')
echo "machine launch script started" > machine-launchstatus
chmod ugo+r machine-launchstatus
echo "machine launch script started" > "$MACHINE_LAUNCH_DIR/machine-launchstatus"
chmod ugo+r "$MACHINE_LAUNCH_DIR/machine-launchstatus"
# platform-specific setup
case "$OS_FLAVOR" in
@ -117,6 +119,8 @@ set -o pipefail
;;
centos)
;;
amzn)
;;
*)
echo "::ERROR:: Unknown OS flavor '$OS_FLAVOR'. Unable to do platform-specific setup."
exit 1
@ -287,8 +291,7 @@ set -o pipefail
# emergency fix for buildroot open files limit issue on centos:
echo "* hard nofile 16384" | sudo tee --append /etc/security/limits.conf
} 2>&1 | tee machine-launchstatus.log
chmod ugo+r machine-launchstatus.log
} 2>&1 | tee "$MACHINE_LAUNCH_DIR/machine-launchstatus.log"
chmod ugo+r "$MACHINE_LAUNCH_DIR/machine-launchstatus.log"
echo "machine launch script completed" >>machine-launchstatus
echo "machine launch script completed" >> "$MACHINE_LAUNCH_DIR/machine-launchstatus"