Merge pull request #1460 from firesim/al2
Support AL2 manager instances
This commit is contained in:
commit
26bb89d5f5
|
@ -17,9 +17,9 @@ from ci_variables import ci_env
|
|||
def wait_machine_launch_complete():
|
||||
# Catch any exception that occurs so that we can gracefully teardown
|
||||
with settings(warn_only=True):
|
||||
rc = run("timeout 20m grep -q '.*machine launch script complete.*' <(tail -f /machine-launchstatus)").return_code
|
||||
rc = run("timeout 20m grep -q '.*machine launch script complete.*' <(tail -f /tmp/machine-launchstatus)").return_code
|
||||
if rc != 0:
|
||||
run("cat /machine-launchstatus.log")
|
||||
run("cat /tmp/machine-launchstatus.log")
|
||||
raise Exception("machine-launch-script.sh failed to run")
|
||||
|
||||
def setup_self_hosted_runners(platform_lib: PlatformLib):
|
||||
|
|
|
@ -39,7 +39,16 @@ rootLogger = logging.getLogger()
|
|||
# https://console.aws.amazon.com/ec2/v2/home?region=us-east-1#Images:visibility=public-images;search=FPGA%20Developer;sort=name
|
||||
# And whenever this changes, you also need to update deploy/tests/test_amis.json
|
||||
# by running scripts/update_test_amis.py
|
||||
f1_ami_name = "FPGA Developer AMI - 1.12.1-40257ab5-6688-4c95-97d1-e251a40fd1fc"
|
||||
# additionally, for normal use this assumes that the AMI used by the runhosts and manager instance match.
|
||||
# in the case of CI (or launching instances from a non-EC2 instance), this defaults to the centos based AMI.
|
||||
def get_f1_ami_name() -> str:
    """Return the name of the AWS FPGA Developer AMI matching the current user.

    The choice is driven by the ``USER`` environment variable of the calling
    process: ``amzn`` selects the AL2 image, and every other value selects the
    CentOS image. A missing or unrecognized ``USER`` prints a warning and
    falls back to the CentOS image instead of raising.

    Returns:
        The AMI name string.
    """
    # .get() rather than indexing: USER is not guaranteed to be set (e.g. in
    # minimal CI/container environments), and that case must hit the fallback.
    cuser = os.environ.get("USER")
    if cuser == "amzn":
        return "FPGA Developer AMI(AL2) - 1.11.3-62ddb7b2-2f1e-4c38-a111-9093dcb1656f"
    if cuser != "centos":
        print("Unknown $USER (expected centos/amzn). Defaulting to the Centos AWS EC2 AMI.")
    return "FPGA Developer AMI - 1.12.1-40257ab5-6688-4c95-97d1-e251a40fd1fc"
|
||||
|
||||
class MockBoto3Instance:
|
||||
""" This is used for testing without actually launching instances. """
|
||||
|
@ -227,7 +236,7 @@ def get_f1_ami_id() -> str:
|
|||
""" Get the AWS F1 Developer AMI by looking up the image name -- should be region independent.
|
||||
"""
|
||||
client = boto3.client('ec2')
|
||||
response = client.describe_images(Filters=[{'Name': 'name', 'Values': [f1_ami_name]}])
|
||||
response = client.describe_images(Filters=[{'Name': 'name', 'Values': [get_f1_ami_name()]}])
|
||||
assert len(response['Images']) == 1
|
||||
return response['Images'][0]['ImageId']
|
||||
|
||||
|
|
|
@ -160,7 +160,7 @@ class RunFarm(metaclass=abc.ABCMeta):
|
|||
def __init__(self, args: Dict[str, Any], metasimulation_enabled: bool) -> None:
|
||||
self.args = args
|
||||
self.metasimulation_enabled = metasimulation_enabled
|
||||
self.default_simulation_dir = self.args.get("default_simulation_dir", "/home/centos")
|
||||
self.default_simulation_dir = self.args.get("default_simulation_dir", f"/home/{os.environ['USER']}")
|
||||
self.SIM_HOST_HANDLE_TO_MAX_FPGA_SLOTS = dict()
|
||||
self.SIM_HOST_HANDLE_TO_MAX_METASIM_SLOTS = dict()
|
||||
self.SIM_HOST_HANDLE_TO_SWITCH_ONLY_OK = dict()
|
||||
|
|
|
@ -12,6 +12,7 @@ import time
|
|||
from os.path import join as pjoin
|
||||
from os.path import basename, expanduser
|
||||
from os import PathLike, fspath
|
||||
import os
|
||||
from fsspec.core import url_to_fs # type: ignore
|
||||
from pathlib import Path
|
||||
import hashlib
|
||||
|
@ -80,14 +81,14 @@ class URIContainer:
|
|||
return m.hexdigest()
|
||||
|
||||
def _resolve_vanilla_path(self, hwcfg) -> Optional[str]:
|
||||
""" Allows fallback to a vanilla path. Relative paths are resolved relative to firesim/deploy/.
|
||||
""" Allows fallback to a vanilla path. Relative paths are resolved relative to firesim/deploy/.
|
||||
This will convert a vanilla path to a URI, or return None."""
|
||||
uri: Optional[str] = getattr(hwcfg, self.hwcfg_prop)
|
||||
|
||||
# do nothing if there isn't a URI
|
||||
if uri is None:
|
||||
return None
|
||||
|
||||
|
||||
# if already a URI, exit early returning unmodified string
|
||||
is_uri = re.match(_RFC_3986_PATTERN, uri)
|
||||
if is_uri:
|
||||
|
@ -116,7 +117,7 @@ class URIContainer:
|
|||
|
||||
# choose a repeatable path based on the hash of the URI
|
||||
destination = pjoin(local_dir, self.hashed_name(uri))
|
||||
|
||||
|
||||
return (uri, destination)
|
||||
|
||||
def local_pre_download(self, local_dir: str, hwcfg) -> Optional[Tuple[str, str]]:
|
||||
|
@ -126,11 +127,11 @@ class URIContainer:
|
|||
|
||||
# resolve the URI and the path '/{dir}/{hash}' we should download to
|
||||
both = self._choose_path(local_dir, hwcfg)
|
||||
|
||||
|
||||
# do nothing if there isn't a URI
|
||||
if both is None:
|
||||
return None
|
||||
|
||||
|
||||
(uri, destination) = both
|
||||
|
||||
# When it exists, return the same information, but skip the download
|
||||
|
@ -152,11 +153,11 @@ class URIContainer:
|
|||
|
||||
# resolve the URI and the path '/{dir}/{hash}' we should download to
|
||||
both = self._choose_path(local_dir, hwcfg)
|
||||
|
||||
|
||||
# do nothing if there isn't a URI
|
||||
if both is None:
|
||||
return None
|
||||
|
||||
|
||||
(uri, destination) = both
|
||||
|
||||
# because the local file has a nonsense name (the hash)
|
||||
|
@ -237,7 +238,7 @@ class InstanceDeployManager(metaclass=abc.ABCMeta):
|
|||
### XXX Centos Specific
|
||||
run('sudo yum -y install qemu-img')
|
||||
# copy over kernel module
|
||||
put('../build/nbd.ko', '/home/centos/nbd.ko', mirror_local_mode=True)
|
||||
put('../build/nbd.ko', f"/home/{os.environ['USER']}/nbd.ko", mirror_local_mode=True)
|
||||
|
||||
def load_nbd_module(self) -> None:
|
||||
""" If NBD is available and qcow2 support is required, load the nbd
|
||||
|
@ -246,7 +247,7 @@ class InstanceDeployManager(metaclass=abc.ABCMeta):
|
|||
if self.nbd_tracker is not None and self.parent_node.qcow2_support_required():
|
||||
self.instance_logger("Loading NBD Kernel Module.")
|
||||
self.unload_nbd_module()
|
||||
run("""sudo insmod /home/centos/nbd.ko nbds_max={}""".format(self.nbd_tracker.NBDS_MAX))
|
||||
run(f"""sudo insmod /home/{os.environ['USER']}/nbd.ko nbds_max={self.nbd_tracker.NBDS_MAX}""")
|
||||
|
||||
def unload_nbd_module(self) -> None:
|
||||
""" If NBD is available and qcow2 support is required, unload the nbd
|
||||
|
@ -278,7 +279,7 @@ class InstanceDeployManager(metaclass=abc.ABCMeta):
|
|||
""" Returns the path on the remote for a given slot number. """
|
||||
remote_home_dir = self.parent_node.get_sim_dir()
|
||||
remote_sim_dir = f"{remote_home_dir}/sim_slot_{slotno}/"
|
||||
|
||||
|
||||
# so that callers can reliably concatenate folders to the returned value
|
||||
assert remote_sim_dir[-1] == '/', f"Return value of get_remote_sim_dir_for_slot({slotno}) must end with '/'."
|
||||
|
||||
|
@ -294,12 +295,12 @@ class InstanceDeployManager(metaclass=abc.ABCMeta):
|
|||
hwcfg = self.parent_node.sim_slots[slotno].get_resolved_server_hardware_config()
|
||||
for container in self.uri_list:
|
||||
container.local_pre_download(dir, hwcfg)
|
||||
|
||||
|
||||
def get_local_uri_paths(self, slotno: int, dir: str) -> list[Tuple[str, str]]:
|
||||
""" Get all paths of local URIs that were previously downloaded. """
|
||||
|
||||
hwcfg = self.parent_node.sim_slots[slotno].get_resolved_server_hardware_config()
|
||||
|
||||
|
||||
ret = list()
|
||||
for container in self.uri_list:
|
||||
maybe_file = container.get_rsync_path(dir, hwcfg)
|
||||
|
@ -637,7 +638,7 @@ class EC2InstanceDeployManager(InstanceDeployManager):
|
|||
with warn_only():
|
||||
run('git clone https://github.com/aws/aws-fpga')
|
||||
run('cd aws-fpga && git checkout ' + aws_fpga_upstream_version)
|
||||
with cd('/home/centos/aws-fpga'):
|
||||
with cd(f"/home/{os.environ['USER']}/aws-fpga"):
|
||||
run('source sdk_setup.sh')
|
||||
|
||||
def fpga_node_xdma(self) -> None:
|
||||
|
@ -646,10 +647,10 @@ class EC2InstanceDeployManager(InstanceDeployManager):
|
|||
"""
|
||||
if self.instance_assigned_simulations():
|
||||
self.instance_logger("""Copying AWS FPGA XDMA driver to remote node.""")
|
||||
run('mkdir -p /home/centos/xdma/')
|
||||
run(f"mkdir -p /home/{os.environ['USER']}/xdma/")
|
||||
put('../platforms/f1/aws-fpga/sdk/linux_kernel_drivers',
|
||||
'/home/centos/xdma/', mirror_local_mode=True)
|
||||
with cd('/home/centos/xdma/linux_kernel_drivers/xdma/'), \
|
||||
f"/home/{os.environ['USER']}/xdma/", mirror_local_mode=True)
|
||||
with cd(f"/home/{os.environ['USER']}/xdma/linux_kernel_drivers/xdma/"), \
|
||||
prefix("export PATH=/usr/bin:$PATH"):
|
||||
# prefix only needed if conda env is earlier in PATH
|
||||
# see build-setup-nolog.sh for explanation.
|
||||
|
@ -738,7 +739,7 @@ class EC2InstanceDeployManager(InstanceDeployManager):
|
|||
# now load xdma
|
||||
self.instance_logger("Loading XDMA Driver Kernel Module.")
|
||||
# TODO: can make these values automatically be chosen based on link lat
|
||||
run("sudo insmod /home/centos/xdma/linux_kernel_drivers/xdma/xdma.ko poll_mode=1")
|
||||
run(f"sudo insmod /home/{os.environ['USER']}/xdma/linux_kernel_drivers/xdma/xdma.ko poll_mode=1")
|
||||
|
||||
def start_ila_server(self) -> None:
|
||||
""" start the vivado hw_server and virtual jtag on simulation instance. """
|
||||
|
@ -807,12 +808,12 @@ class EC2InstanceDeployManager(InstanceDeployManager):
|
|||
|
||||
class VitisInstanceDeployManager(InstanceDeployManager):
|
||||
""" This class manages a Vitis-enabled instance """
|
||||
|
||||
|
||||
@classmethod
|
||||
def sim_command_requires_sudo(cls) -> bool:
|
||||
""" This sim does not require sudo. """
|
||||
return False
|
||||
|
||||
|
||||
@classmethod
|
||||
def get_xclbin_filename(cls) -> str:
|
||||
""" Get the name of the xclbin inside the sim_slot_X directory on the run host. """
|
||||
|
|
|
@ -84,11 +84,11 @@ before, so we need to first ssh into the instance and make sure the
|
|||
setup is complete.
|
||||
|
||||
In either case, ``ssh`` into your instance (e.g. ``ssh -i firesim.pem centos@YOUR_INSTANCE_IP``) and wait until the
|
||||
``/machine-launchstatus`` file contains all the following text:
|
||||
``/tmp/machine-launchstatus`` file contains all the following text:
|
||||
|
||||
::
|
||||
|
||||
$ cat /machine-launchstatus
|
||||
$ cat /tmp/machine-launchstatus
|
||||
machine launch script started
|
||||
machine launch script completed
|
||||
|
||||
|
|
|
@ -27,7 +27,7 @@ $SCRIPT_DIR/../../deploy/awstools/awstools.py \
|
|||
rm -rf machine-launch-script.sh
|
||||
|
||||
# make sure managerinit finishes properly
|
||||
run "timeout 10m grep -q \".*machine launch script complete.*\" <(tail -f machine-launchstatus)"
|
||||
run "timeout 10m grep -q \".*machine launch script complete.*\" <(tail -f /tmp/machine-launchstatus)"
|
||||
|
||||
# setup the repo (similar to ci)
|
||||
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
#!/bin/bash
|
||||
|
||||
MACHINE_LAUNCH_DIR=/tmp
|
||||
|
||||
CONDA_INSTALL_PREFIX=/opt/conda
|
||||
CONDA_INSTALLER_VERSION=4.12.0-0
|
||||
CONDA_INSTALLER="https://github.com/conda-forge/miniforge/releases/download/${CONDA_INSTALLER_VERSION}/Miniforge3-${CONDA_INSTALLER_VERSION}-Linux-x86_64.sh"
|
||||
|
@ -108,8 +110,8 @@ set -o pipefail
|
|||
OS_FLAVOR=$(grep '^ID=' /etc/os-release | awk -F= '{print $2}' | tr -d '"')
|
||||
OS_VERSION=$(grep '^VERSION_ID=' /etc/os-release | awk -F= '{print $2}' | tr -d '"')
|
||||
|
||||
echo "machine launch script started" > machine-launchstatus
|
||||
chmod ugo+r machine-launchstatus
|
||||
echo "machine launch script started" > "$MACHINE_LAUNCH_DIR/machine-launchstatus"
|
||||
chmod ugo+r "$MACHINE_LAUNCH_DIR/machine-launchstatus"
|
||||
|
||||
# platform-specific setup
|
||||
case "$OS_FLAVOR" in
|
||||
|
@ -117,6 +119,8 @@ set -o pipefail
|
|||
;;
|
||||
centos)
|
||||
;;
|
||||
amzn)
|
||||
;;
|
||||
*)
|
||||
echo "::ERROR:: Unknown OS flavor '$OS_FLAVOR'. Unable to do platform-specific setup."
|
||||
exit 1
|
||||
|
@ -287,8 +291,7 @@ set -o pipefail
|
|||
# emergency fix for buildroot open files limit issue on centos:
|
||||
echo "* hard nofile 16384" | sudo tee --append /etc/security/limits.conf
|
||||
|
||||
} 2>&1 | tee machine-launchstatus.log
|
||||
chmod ugo+r machine-launchstatus.log
|
||||
} 2>&1 | tee "$MACHINE_LAUNCH_DIR/machine-launchstatus.log"
|
||||
chmod ugo+r "$MACHINE_LAUNCH_DIR/machine-launchstatus.log"
|
||||
|
||||
|
||||
echo "machine launch script completed" >>machine-launchstatus
|
||||
echo "machine launch script completed" >> "$MACHINE_LAUNCH_DIR/machine-launchstatus"
|
||||
|
|
Loading…
Reference in New Issue