Silently support AL2 manager instances

This commit is contained in:
abejgonzalez 2023-03-08 11:01:09 -08:00
parent b61d7895bd
commit 76d24d15f1
7 changed files with 48 additions and 33 deletions

View File

@ -17,9 +17,9 @@ from ci_variables import ci_env
def wait_machine_launch_complete():
# Catch any exception that occurs so that we can gracefully teardown
with settings(warn_only=True):
rc = run("timeout 20m grep -q '.*machine launch script complete.*' <(tail -f /machine-launchstatus)").return_code
rc = run("timeout 20m grep -q '.*machine launch script complete.*' <(tail -f /tmp/machine-launchstatus)").return_code
if rc != 0:
run("cat /machine-launchstatus.log")
run("cat /tmp/machine-launchstatus.log")
raise Exception("machine-launch-script.sh failed to run")
def setup_self_hosted_runners(platform_lib: PlatformLib):

View File

@ -39,7 +39,15 @@ rootLogger = logging.getLogger()
# https://console.aws.amazon.com/ec2/v2/home?region=us-east-1#Images:visibility=public-images;search=FPGA%20Developer;sort=name
# And whenever this changes, you also need to update deploy/tests/test_amis.json
# by running scripts/update_test_amis.py
f1_ami_name = "FPGA Developer AMI - 1.12.1-40257ab5-6688-4c95-97d1-e251a40fd1fc"
def get_f1_ami_name() -> str:
    """Return the name of the AWS FPGA Developer AMI for the current user.

    The user name reported by a local ``whoami`` selects the image:
    ``centos`` maps to the CentOS-based FPGA Developer AMI and ``amzn`` maps
    to the AL2 FPGA Developer AMI.

    Returns:
        The AMI name string to look up.

    Raises:
        AssertionError: If ``whoami`` reports a user other than
            ``centos`` or ``amzn``.
    """
    # NOTE(review): stock Amazon Linux 2 images normally log in as
    # ``ec2-user``; confirm that AL2 manager instances really report ``amzn``.
    cuser = local("whoami", capture=True)

    if cuser == "centos":
        return "FPGA Developer AMI - 1.12.1-40257ab5-6688-4c95-97d1-e251a40fd1fc"
    if cuser == "amzn":
        return "FPGA Developer AMI(AL2) - 1.11.3-62ddb7b2-2f1e-4c38-a111-9093dcb1656f"

    # Raised explicitly instead of via ``assert False`` so the check still
    # fires under ``python -O``; a stripped assert would fall through and
    # hand an empty AMI name to the caller.
    raise AssertionError(
        "Unknown user given by 'whoami' (expected centos/amzn). Are you running on AWS EC2?"
    )
class MockBoto3Instance:
""" This is used for testing without actually launching instances. """
@ -225,7 +233,7 @@ def get_f1_ami_id() -> str:
""" Get the AWS F1 Developer AMI by looking up the image name -- should be region independent.
"""
client = boto3.client('ec2')
response = client.describe_images(Filters=[{'Name': 'name', 'Values': [f1_ami_name]}])
response = client.describe_images(Filters=[{'Name': 'name', 'Values': [get_f1_ami_name()]}])
assert len(response['Images']) == 1
return response['Images'][0]['ImageId']

View File

@ -160,7 +160,8 @@ class RunFarm(metaclass=abc.ABCMeta):
def __init__(self, args: Dict[str, Any], metasimulation_enabled: bool) -> None:
self.args = args
self.metasimulation_enabled = metasimulation_enabled
self.default_simulation_dir = self.args.get("default_simulation_dir", "/home/centos")
cuser = local("whoami", capture=True)
self.default_simulation_dir = self.args.get("default_simulation_dir", f"/home/{cuser}")
self.SIM_HOST_HANDLE_TO_MAX_FPGA_SLOTS = dict()
self.SIM_HOST_HANDLE_TO_MAX_METASIM_SLOTS = dict()
self.SIM_HOST_HANDLE_TO_SWITCH_ONLY_OK = dict()

View File

@ -189,6 +189,9 @@ class InstanceDeployManager(metaclass=abc.ABCMeta):
self.uri_list = list()
self.uri_list.append(URIContainer('driver_tar', self.get_driver_tar_filename()))
def get_current_user(self) -> str:
    """Return the user name reported by running ``whoami`` locally.

    The command goes through ``local`` with output capture on every call;
    the result is not cached.
    """
    username = local("whoami", capture=True)
    return username
@abc.abstractmethod
def infrasetup_instance(self, uridir: str) -> None:
"""Run platform specific implementation of how to setup simulations.
@ -237,7 +240,7 @@ class InstanceDeployManager(metaclass=abc.ABCMeta):
### XXX Centos Specific
run('sudo yum -y install qemu-img')
# copy over kernel module
put('../build/nbd.ko', '/home/centos/nbd.ko', mirror_local_mode=True)
put('../build/nbd.ko', f'/home/{self.get_current_user()}/nbd.ko', mirror_local_mode=True)
def load_nbd_module(self) -> None:
""" If NBD is available and qcow2 support is required, load the nbd
@ -246,7 +249,7 @@ class InstanceDeployManager(metaclass=abc.ABCMeta):
if self.nbd_tracker is not None and self.parent_node.qcow2_support_required():
self.instance_logger("Loading NBD Kernel Module.")
self.unload_nbd_module()
run("""sudo insmod /home/centos/nbd.ko nbds_max={}""".format(self.nbd_tracker.NBDS_MAX))
run(f"""sudo insmod /home/{self.get_current_user()}/nbd.ko nbds_max={self.nbd_tracker.NBDS_MAX}""")
def unload_nbd_module(self) -> None:
""" If NBD is available and qcow2 support is required, unload the nbd
@ -637,7 +640,7 @@ class EC2InstanceDeployManager(InstanceDeployManager):
with warn_only():
run('git clone https://github.com/aws/aws-fpga')
run('cd aws-fpga && git checkout ' + aws_fpga_upstream_version)
with cd('/home/centos/aws-fpga'):
with cd(f'/home/{self.get_current_user()}/aws-fpga'):
run('source sdk_setup.sh')
def fpga_node_xdma(self) -> None:
@ -646,10 +649,10 @@ class EC2InstanceDeployManager(InstanceDeployManager):
"""
if self.instance_assigned_simulations():
self.instance_logger("""Copying AWS FPGA XDMA driver to remote node.""")
run('mkdir -p /home/centos/xdma/')
run(f'mkdir -p /home/{self.get_current_user()}/xdma/')
put('../platforms/f1/aws-fpga/sdk/linux_kernel_drivers',
'/home/centos/xdma/', mirror_local_mode=True)
with cd('/home/centos/xdma/linux_kernel_drivers/xdma/'), \
f'/home/{self.get_current_user()}/xdma/', mirror_local_mode=True)
with cd(f'/home/{self.get_current_user()}/xdma/linux_kernel_drivers/xdma/'), \
prefix("export PATH=/usr/bin:$PATH"):
# prefix only needed if conda env is earlier in PATH
# see build-setup-nolog.sh for explanation.
@ -738,7 +741,7 @@ class EC2InstanceDeployManager(InstanceDeployManager):
# now load xdma
self.instance_logger("Loading XDMA Driver Kernel Module.")
# TODO: can make these values automatically be chosen based on link lat
run("sudo insmod /home/centos/xdma/linux_kernel_drivers/xdma/xdma.ko poll_mode=1")
run(f"sudo insmod /home/{self.get_current_user()}/xdma/linux_kernel_drivers/xdma/xdma.ko poll_mode=1")
def start_ila_server(self) -> None:
""" start the vivado hw_server and virtual jtag on simulation instance. """

View File

@ -84,11 +84,11 @@ before, so we need to first ssh into the instance and make sure the
setup is complete.
In either case, ``ssh`` into your instance (e.g. ``ssh -i firesim.pem centos@YOUR_INSTANCE_IP``) and wait until the
``/machine-launchstatus`` file contains all the following text:
``/tmp/machine-launchstatus`` file contains all the following text:
::
$ cat /machine-launchstatus
$ cat /tmp/machine-launchstatus
machine launch script started
machine launch script completed

View File

@ -27,7 +27,7 @@ $SCRIPT_DIR/../../deploy/awstools/awstools.py \
rm -rf machine-launch-script.sh
# make sure managerinit finishes properly
run "timeout 10m grep -q \".*machine launch script complete.*\" <(tail -f machine-launchstatus)"
run "timeout 10m grep -q \".*machine launch script complete.*\" <(tail -f /tmp/machine-launchstatus)"
# setup the repo (similar to ci)

View File

@ -1,5 +1,7 @@
#!/bin/bash
MACHINE_LAUNCH_DIR=/tmp
CONDA_INSTALL_PREFIX=/opt/conda
CONDA_INSTALLER_VERSION=4.12.0-0
CONDA_INSTALLER="https://github.com/conda-forge/miniforge/releases/download/${CONDA_INSTALLER_VERSION}/Miniforge3-${CONDA_INSTALLER_VERSION}-Linux-x86_64.sh"
@ -108,8 +110,8 @@ set -o pipefail
OS_FLAVOR=$(grep '^ID=' /etc/os-release | awk -F= '{print $2}' | tr -d '"')
OS_VERSION=$(grep '^VERSION_ID=' /etc/os-release | awk -F= '{print $2}' | tr -d '"')
echo "machine launch script started" > machine-launchstatus
chmod ugo+r machine-launchstatus
echo "machine launch script started" > "$MACHINE_LAUNCH_DIR/machine-launchstatus"
chmod ugo+r "$MACHINE_LAUNCH_DIR/machine-launchstatus"
# platform-specific setup
case "$OS_FLAVOR" in
@ -117,6 +119,8 @@ set -o pipefail
;;
centos)
;;
amzn)
;;
*)
echo "::ERROR:: Unknown OS flavor '$OS_FLAVOR'. Unable to do platform-specific setup."
exit 1
@ -287,8 +291,7 @@ set -o pipefail
# emergency fix for buildroot open files limit issue on centos:
echo "* hard nofile 16384" | sudo tee --append /etc/security/limits.conf
} 2>&1 | tee machine-launchstatus.log
chmod ugo+r machine-launchstatus.log
} 2>&1 | tee "$MACHINE_LAUNCH_DIR/machine-launchstatus.log"
chmod ugo+r "$MACHINE_LAUNCH_DIR/machine-launchstatus.log"
echo "machine launch script completed" >>machine-launchstatus
echo "machine launch script completed" >> "$MACHINE_LAUNCH_DIR/machine-launchstatus"