Silently support AL2 manager instances

2023-03-08 11:01:09 -08:00 · 2023-03-08 11:01:09 -08:00 · 76d24d15f1
parent b61d7895bd
commit 76d24d15f1
7 changed files with 48 additions and 33 deletions
--- a/.github/scripts/setup-manager-self-hosted.py
+++ b/.github/scripts/setup-manager-self-hosted.py
@ -17,9 +17,9 @@ from ci_variables import ci_env
 def wait_machine_launch_complete():
    # Catch any exception that occurs so that we can gracefully teardown
    with settings(warn_only=True):
-        rc = run("timeout 20m grep -q '.*machine launch script complete.*' <(tail -f /machine-launchstatus)").return_code
+        rc = run("timeout 20m grep -q '.*machine launch script complete.*' <(tail -f /tmp/machine-launchstatus)").return_code
        if rc != 0:
-            run("cat /machine-launchstatus.log")
+            run("cat /tmp/machine-launchstatus.log")
            raise Exception("machine-launch-script.sh failed to run")

 def setup_self_hosted_runners(platform_lib: PlatformLib):
--- a/deploy/awstools/awstools.py
+++ b/deploy/awstools/awstools.py
@ -39,7 +39,15 @@ rootLogger = logging.getLogger()
 # https://console.aws.amazon.com/ec2/v2/home?region=us-east-1#Images:visibility=public-images;search=FPGA%20Developer;sort=name
 # And whenever this changes, you also need to update deploy/tests/test_amis.json
 # by running scripts/update_test_amis.py
-f1_ami_name = "FPGA Developer AMI - 1.12.1-40257ab5-6688-4c95-97d1-e251a40fd1fc"
+def get_f1_ami_name() -> str:
+    """Return the FPGA Developer AMI name chosen by the `whoami` user ('centos' or 'amzn'); raise AssertionError otherwise."""
+    cuser = local("whoami", capture=True)
+    if cuser == "centos":
+        return "FPGA Developer AMI - 1.12.1-40257ab5-6688-4c95-97d1-e251a40fd1fc"
+    if cuser == "amzn":
+        return "FPGA Developer AMI(AL2) - 1.11.3-62ddb7b2-2f1e-4c38-a111-9093dcb1656f"
+    # Raise explicitly: an `assert False` is stripped under `python -O`, which would silently return "".
+    raise AssertionError("Unknown user given by 'whoami' (expected centos/amzn). Are you running on AWS EC2?")

 class MockBoto3Instance:
    """ This is used for testing without actually launching instances. """
@ -225,7 +233,7 @@ def get_f1_ami_id() -> str:
    """ Get the AWS F1 Developer AMI by looking up the image name -- should be region independent.
    """
    client = boto3.client('ec2')
-    response = client.describe_images(Filters=[{'Name': 'name', 'Values': [f1_ami_name]}])
+    response = client.describe_images(Filters=[{'Name': 'name', 'Values': [get_f1_ami_name()]}])
    assert len(response['Images']) == 1
    return response['Images'][0]['ImageId']

--- a/deploy/runtools/run_farm.py
+++ b/deploy/runtools/run_farm.py
@ -160,7 +160,8 @@ class RunFarm(metaclass=abc.ABCMeta):
    def __init__(self, args: Dict[str, Any], metasimulation_enabled: bool) -> None:
        self.args = args
        self.metasimulation_enabled = metasimulation_enabled
-        self.default_simulation_dir = self.args.get("default_simulation_dir", "/home/centos")
+        cuser = local("whoami", capture=True)
+        self.default_simulation_dir = self.args.get("default_simulation_dir", f"/home/{cuser}")
        self.SIM_HOST_HANDLE_TO_MAX_FPGA_SLOTS = dict()
        self.SIM_HOST_HANDLE_TO_MAX_METASIM_SLOTS = dict()
        self.SIM_HOST_HANDLE_TO_SWITCH_ONLY_OK = dict()
--- a/deploy/runtools/run_farm_deploy_managers.py
+++ b/deploy/runtools/run_farm_deploy_managers.py
@ -189,6 +189,9 @@ class InstanceDeployManager(metaclass=abc.ABCMeta):
        self.uri_list = list()
        self.uri_list.append(URIContainer('driver_tar', self.get_driver_tar_filename()))

+    def get_current_user(self) -> str:
+        """Return the captured output of ``whoami`` executed through ``local``.
+
+        NOTE(review): callers splice this value into ``/home/<user>/...`` paths
+        handed to ``run``/``put``/``cd``; presumably the login user on the
+        target host matches the user reported here -- confirm.
+        """
+        return local("whoami", capture=True)
+
    @abc.abstractmethod
    def infrasetup_instance(self, uridir: str) -> None:
        """Run platform specific implementation of how to setup simulations.
@ -237,7 +240,7 @@ class InstanceDeployManager(metaclass=abc.ABCMeta):
            ### XXX Centos Specific
            run('sudo yum -y install qemu-img')
            # copy over kernel module
-            put('../build/nbd.ko', '/home/centos/nbd.ko', mirror_local_mode=True)
+            put('../build/nbd.ko', f'/home/{self.get_current_user()}/nbd.ko', mirror_local_mode=True)

    def load_nbd_module(self) -> None:
        """ If NBD is available and qcow2 support is required, load the nbd
@ -246,7 +249,7 @@ class InstanceDeployManager(metaclass=abc.ABCMeta):
        if self.nbd_tracker is not None and self.parent_node.qcow2_support_required():
            self.instance_logger("Loading NBD Kernel Module.")
            self.unload_nbd_module()
-            run("""sudo insmod /home/centos/nbd.ko nbds_max={}""".format(self.nbd_tracker.NBDS_MAX))
+            run(f"""sudo insmod /home/{self.get_current_user()}/nbd.ko nbds_max={self.nbd_tracker.NBDS_MAX}""")

    def unload_nbd_module(self) -> None:
        """ If NBD is available and qcow2 support is required, unload the nbd
@ -637,7 +640,7 @@ class EC2InstanceDeployManager(InstanceDeployManager):
            with warn_only():
                run('git clone https://github.com/aws/aws-fpga')
                run('cd aws-fpga && git checkout ' + aws_fpga_upstream_version)
-            with cd('/home/centos/aws-fpga'):
+            with cd(f'/home/{self.get_current_user()}/aws-fpga'):
                run('source sdk_setup.sh')

    def fpga_node_xdma(self) -> None:
@ -646,10 +649,10 @@ class EC2InstanceDeployManager(InstanceDeployManager):
        """
        if self.instance_assigned_simulations():
            self.instance_logger("""Copying AWS FPGA XDMA driver to remote node.""")
-            run('mkdir -p /home/centos/xdma/')
+            run(f'mkdir -p /home/{self.get_current_user()}/xdma/')
            put('../platforms/f1/aws-fpga/sdk/linux_kernel_drivers',
-                '/home/centos/xdma/', mirror_local_mode=True)
-            with cd('/home/centos/xdma/linux_kernel_drivers/xdma/'), \
+                f'/home/{self.get_current_user()}/xdma/', mirror_local_mode=True)
+            with cd(f'/home/{self.get_current_user()}/xdma/linux_kernel_drivers/xdma/'), \
                prefix("export PATH=/usr/bin:$PATH"):
                # prefix only needed if conda env is earlier in PATH
                # see build-setup-nolog.sh for explanation.
@ -738,7 +741,7 @@ class EC2InstanceDeployManager(InstanceDeployManager):
            # now load xdma
            self.instance_logger("Loading XDMA Driver Kernel Module.")
            # TODO: can make these values automatically be chosen based on link lat
-            run("sudo insmod /home/centos/xdma/linux_kernel_drivers/xdma/xdma.ko poll_mode=1")
+            run(f"sudo insmod /home/{self.get_current_user()}/xdma/linux_kernel_drivers/xdma/xdma.ko poll_mode=1")

    def start_ila_server(self) -> None:
        """ start the vivado hw_server and virtual jtag on simulation instance. """
--- a/docs/Initial-Setup/Setting-up-your-Manager-Instance.rst
+++ b/docs/Initial-Setup/Setting-up-your-Manager-Instance.rst
@ -84,11 +84,11 @@ before, so we need to first ssh into the instance and make sure the
 setup is complete.

 In either case, ``ssh`` into your instance (e.g. ``ssh -i firesim.pem centos@YOUR_INSTANCE_IP``) and wait until the
-``/machine-launchstatus`` file contains all the following text:
+``/tmp/machine-launchstatus`` file contains all the following text:

 ::

-    $ cat /machine-launchstatus
+    $ cat /tmp/machine-launchstatus
    machine launch script started
    machine launch script completed

--- a/regression/aws-ec2-tests/launch-setup-manager-instance.sh
+++ b/regression/aws-ec2-tests/launch-setup-manager-instance.sh
@ -27,7 +27,7 @@ $SCRIPT_DIR/../../deploy/awstools/awstools.py \
 rm -rf machine-launch-script.sh

 # make sure managerinit finishes properly
-run "timeout 10m grep -q \".*machine launch script complete.*\" <(tail -f machine-launchstatus)"
+run "timeout 10m grep -q \".*machine launch script complete.*\" <(tail -f /tmp/machine-launchstatus)"

 # setup the repo (similar to ci)

--- a/scripts/machine-launch-script.sh
+++ b/scripts/machine-launch-script.sh
@ -1,5 +1,7 @@
 #!/bin/bash

+MACHINE_LAUNCH_DIR=/tmp
+
 CONDA_INSTALL_PREFIX=/opt/conda
 CONDA_INSTALLER_VERSION=4.12.0-0
 CONDA_INSTALLER="https://github.com/conda-forge/miniforge/releases/download/${CONDA_INSTALLER_VERSION}/Miniforge3-${CONDA_INSTALLER_VERSION}-Linux-x86_64.sh"
@ -108,8 +110,8 @@ set -o pipefail
    OS_FLAVOR=$(grep '^ID=' /etc/os-release | awk -F= '{print $2}' | tr -d '"')
    OS_VERSION=$(grep '^VERSION_ID=' /etc/os-release | awk -F= '{print $2}' | tr -d '"')

-    echo "machine launch script started" > machine-launchstatus
-    chmod ugo+r machine-launchstatus
+    echo "machine launch script started" > "$MACHINE_LAUNCH_DIR/machine-launchstatus"
+    chmod ugo+r "$MACHINE_LAUNCH_DIR/machine-launchstatus"

    # platform-specific setup
    case "$OS_FLAVOR" in
@ -117,6 +119,8 @@ set -o pipefail
            ;;
        centos)
            ;;
+        amzn)
+            ;;
        *)
            echo "::ERROR:: Unknown OS flavor '$OS_FLAVOR'. Unable to do platform-specific setup."
            exit 1
@ -287,8 +291,7 @@ set -o pipefail
    # emergency fix for buildroot open files limit issue on centos:
    echo "* hard nofile 16384" | sudo tee --append /etc/security/limits.conf

-} 2>&1 | tee machine-launchstatus.log
-chmod ugo+r machine-launchstatus.log
+} 2>&1 | tee "$MACHINE_LAUNCH_DIR/machine-launchstatus.log"
+chmod ugo+r "$MACHINE_LAUNCH_DIR/machine-launchstatus.log"

-
-echo "machine launch script completed" >>machine-launchstatus
+echo "machine launch script completed" >> "$MACHINE_LAUNCH_DIR/machine-launchstatus"