From 400dbb03f3f1792d8233feb16bc89272d24b9133 Mon Sep 17 00:00:00 2001 From: Abraham Gonzalez Date: Tue, 9 Aug 2022 22:07:33 +0000 Subject: [PATCH 1/4] Unique runners for workflow run | Delete offline runners --- .github/scripts/ci_variables.py | 1 + .github/scripts/common.py | 45 +++++++++++++------- .github/scripts/setup-manager-self-hosted.py | 17 ++++++-- 3 files changed, 44 insertions(+), 19 deletions(-) diff --git a/.github/scripts/ci_variables.py b/.github/scripts/ci_variables.py index 3c983583..80fbb17b 100644 --- a/.github/scripts/ci_variables.py +++ b/.github/scripts/ci_variables.py @@ -15,6 +15,7 @@ local_fsim_dir = os.path.normpath((os.path.realpath(__file__)) + "/../../..") # This is used as a unique tag for all instances launched in a workflow ci_workflow_run_id = os.environ['GITHUB_RUN_ID'] if not RUN_LOCAL else 0 +ci_workflow_run_retries = os.environ['GITHUB_RUN_ATTEMPT'] if not RUN_LOCAL else 0 ci_commit_sha1 = os.environ['GITHUB_SHA'] if not RUN_LOCAL else 0 # Multiple clones of the FireSim repository exists on manager. We expect state diff --git a/.github/scripts/common.py b/.github/scripts/common.py index 650d751e..e021c295 100644 --- a/.github/scripts/common.py +++ b/.github/scripts/common.py @@ -5,6 +5,8 @@ from fabric.api import * import requests from ci_variables import ci_firesim_dir, local_fsim_dir, ci_gha_api_url, ci_repo_name +from typing import Dict, List, Any + # Reuse manager utilities # Note: ci_firesim_dir must not be used here because the persistent clone my not be initialized yet. 
sys.path.append(local_fsim_dir + "/deploy") @@ -89,23 +91,34 @@ def instance_metadata_str(instance): return static_md + dynamic_md -def deregister_runner_if_exists(gh_token, runner_name): - headers = {'Authorization': "token {}".format(gh_token.strip())} +def get_header(gh_token: str) -> Dict[str, str]: + return {"Authorization": f"token {gh_token.strip()}", "Accept": "application/vnd.github+json"} - # Check if exists before deregistering - r = requests.get(gha_runners_api_url, headers=headers) +def get_runners(gh_token: str) -> List: + r = requests.get(gha_runners_api_url, headers=get_header(gh_token)) if r.status_code != 200: - # if couldn't delete then just exit - return - + raise Exception("Unable to retrieve list of GitHub Actions Runners") res_dict = r.json() - runner_list = res_dict["runners"] - for runner in runner_list: + return res_dict["runners"] + +def delete_runner(gh_token: str, runner: Dict[str, Any]) -> bool: + r = requests.delete("https://api.github.com/repos/firesim/firesim/actions/runners/{}".format(runner["id"]), headers=get_header(gh_token)) + if r.status_code != 204: + print(f"""Unable to delete runner {runner["name"]} with id: {runner["id"]}""") + return False + return True + +def deregister_offline_runners(gh_token: str) -> None: + runners = get_runners(gh_token) + for runner in runners: + if runner["status"] == "offline": + delete_runner(gh_token, runner) + +def deregister_runners(gh_token: str, runner_name: str) -> None: + runners = get_runners(gh_token) + for runner in runners: if runner_name in runner["name"]: - r = requests.delete(f"""{gha_runners_api_url}/{runner["id"]}""", headers=headers) - if r.status_code != 204: - # if couldn't delete then just exit - return + delete_runner(gh_token, runner) def change_workflow_instance_states(gh_token, tag_value, state_change, dryrun=False): """ Change the state of all instances sharing the same CI workflow run's tag. 
""" @@ -123,7 +136,8 @@ def change_workflow_instance_states(gh_token, tag_value, state_change, dryrun=Fa client = boto3.client('ec2') if state_change == 'stop': print("Stopping instances: {}".format(", ".join(instance_ids))) - deregister_runner_if_exists(gh_token, tag_value) + deregister_offline_runners(gh_token) + deregister_runners(gh_token, tag_value) client.stop_instances(InstanceIds=instance_ids, DryRun=dryrun) elif state_change == 'start': print("Starting instances: {}".format(", ".join(instance_ids))) @@ -140,7 +154,8 @@ def change_workflow_instance_states(gh_token, tag_value, state_change, dryrun=Fa elif state_change == 'terminate': print("Terminating instances: {}".format(", ".join(instance_ids))) - deregister_runner_if_exists(gh_token, tag_value) + deregister_offline_runners(gh_token) + deregister_runners(gh_token, tag_value) client.terminate_instances(InstanceIds=instance_ids, DryRun=dryrun) else: raise ValueError("Unrecognized transition type: {}".format(state_change)) diff --git a/.github/scripts/setup-manager-self-hosted.py b/.github/scripts/setup-manager-self-hosted.py index bd4869e0..6bc87db4 100755 --- a/.github/scripts/setup-manager-self-hosted.py +++ b/.github/scripts/setup-manager-self-hosted.py @@ -37,8 +37,18 @@ def setup_self_hosted_runners(): print("Using Github Actions Runner v{}".format(RUNNER_VERSION)) # create NUM_RUNNER self-hosted runners on the manager that run in parallel NUM_RUNNERS = 4 + + # verify no existing runners are running and remove unused runners + with settings(warn_only=True): + for runner_idx in range(NUM_RUNNERS): + run(f"screen -XS gh-a-runner-{runner_idx} quit") + deregister_offline_runners(ci_personal_api_token) + deregister_runners(ci_personal_api_token, ci_workflow_run_id) + + # spawn runners for runner_idx in range(NUM_RUNNERS): actions_dir = "{}/actions-runner-{}".format(manager_home_dir, runner_idx) + run("rm -rf {}".format(actions_dir)) run("mkdir -p {}".format(actions_dir)) with cd(actions_dir): run("curl -o 
actions-runner-linux-x64-{}.tar.gz -L https://github.com/actions/runner/releases/download/v{}/actions-runner-linux-x64-{}.tar.gz".format(RUNNER_VERSION, RUNNER_VERSION, RUNNER_VERSION)) @@ -48,10 +58,9 @@ def setup_self_hosted_runners(): run("sudo ./bin/installdependencies.sh") # get registration token from API - headers = {'Authorization': "token {}".format(ci_personal_api_token.strip())} - r = requests.post(f"{gha_runners_api_url}/registration-token", headers=headers) + r = requests.post(f"{gha_runners_api_url}/registration-token", headers=get_header(ci_personal_api_token)) if r.status_code != 201: - raise Exception("HTTPS error: {} {}. Retrying.".format(r.status_code, r.json())) + raise Exception("HTTPS error: {} {}".format(r.status_code, r.json())) res_dict = r.json() reg_token = res_dict["token"] @@ -59,7 +68,7 @@ def setup_self_hosted_runners(): # config runner put(".github/scripts/gh-a-runner.expect", actions_dir) run("chmod +x gh-a-runner.expect") - runner_name = "{}-{}".format(ci_workflow_run_id, runner_idx) # used to teardown runner + runner_name = f"{ci_workflow_run_id}-{ci_workflow_run_retries}-{runner_idx}" # used to teardown runner unique_label = ci_workflow_run_id # used within the yaml to choose a runner run("./gh-a-runner.expect {} {} {}".format(reg_token, runner_name, unique_label)) From 5ba61feb737e8398231a04e04c7192c50d8d4838 Mon Sep 17 00:00:00 2001 From: Abraham Gonzalez Date: Tue, 9 Aug 2022 22:16:01 +0000 Subject: [PATCH 2/4] Move cull to global periodic workflow --- .github/scripts/common.py | 2 -- .github/scripts/cull-old-ci-instances.py | 4 ++-- .github/scripts/cull-old-ci-runners.py | 16 ++++++++++++++++ .github/scripts/setup-manager-self-hosted.py | 1 - ...im-cull-instances.yml => firesim-cleanup.yml} | 12 +++++++++++- 5 files changed, 29 insertions(+), 6 deletions(-) create mode 100755 .github/scripts/cull-old-ci-runners.py rename .github/workflows/{firesim-cull-instances.yml => firesim-cleanup.yml} (70%) diff --git 
a/.github/scripts/common.py b/.github/scripts/common.py index e021c295..b6c9b8e5 100644 --- a/.github/scripts/common.py +++ b/.github/scripts/common.py @@ -136,7 +136,6 @@ def change_workflow_instance_states(gh_token, tag_value, state_change, dryrun=Fa client = boto3.client('ec2') if state_change == 'stop': print("Stopping instances: {}".format(", ".join(instance_ids))) - deregister_offline_runners(gh_token) deregister_runners(gh_token, tag_value) client.stop_instances(InstanceIds=instance_ids, DryRun=dryrun) elif state_change == 'start': @@ -154,7 +153,6 @@ def change_workflow_instance_states(gh_token, tag_value, state_change, dryrun=Fa elif state_change == 'terminate': print("Terminating instances: {}".format(", ".join(instance_ids))) - deregister_offline_runners(gh_token) deregister_runners(gh_token, tag_value) client.terminate_instances(InstanceIds=instance_ids, DryRun=dryrun) else: diff --git a/.github/scripts/cull-old-ci-instances.py b/.github/scripts/cull-old-ci-instances.py index 7c49dbc9..3649c568 100755 --- a/.github/scripts/cull-old-ci-instances.py +++ b/.github/scripts/cull-old-ci-instances.py @@ -8,7 +8,7 @@ import pytz import boto3 import sys -from common import unique_tag_key, deregister_runner_if_exists +from common import unique_tag_key, deregister_runners # Reuse manager utilities from ci_variables import ci_workdir, ci_personal_api_token, ci_workflow_run_id @@ -31,7 +31,7 @@ def main(): for inst in all_ci_instances: lifetime_secs = (current_time - inst["LaunchTime"]).total_seconds() if lifetime_secs > (INSTANCE_LIFETIME_LIMIT_HOURS * 3600): - deregister_runner_if_exists(ci_personal_api_token, ci_workflow_run_id) + deregister_runners(ci_personal_api_token, ci_workflow_run_id) client.terminate_instances(InstanceIds=[inst["InstanceId"]]) print(" " + inst["InstanceId"]) diff --git a/.github/scripts/cull-old-ci-runners.py b/.github/scripts/cull-old-ci-runners.py new file mode 100755 index 00000000..c8c8cd17 --- /dev/null +++ 
b/.github/scripts/cull-old-ci-runners.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python3 + +# Runs periodically in its own workflow in the CI/CD environment to tear down +# runners that are offline + +from common import deregister_offline_runners + +# Reuse manager utilities +from ci_variables import ci_personal_api_token + +def main(): + # deregister all offline runners + deregister_offline_runners(ci_personal_api_token) + +if __name__ == "__main__": + main() diff --git a/.github/scripts/setup-manager-self-hosted.py b/.github/scripts/setup-manager-self-hosted.py index 6bc87db4..2ee2dc14 100755 --- a/.github/scripts/setup-manager-self-hosted.py +++ b/.github/scripts/setup-manager-self-hosted.py @@ -42,7 +42,6 @@ def setup_self_hosted_runners(): with settings(warn_only=True): for runner_idx in range(NUM_RUNNERS): run(f"screen -XS gh-a-runner-{runner_idx} quit") - deregister_offline_runners(ci_personal_api_token) deregister_runners(ci_personal_api_token, ci_workflow_run_id) # spawn runners diff --git a/.github/workflows/firesim-cull-instances.yml b/.github/workflows/firesim-cleanup.yml similarity index 70% rename from .github/workflows/firesim-cull-instances.yml rename to .github/workflows/firesim-cleanup.yml index 9ba4e6ba..0e34d4ec 100644 --- a/.github/workflows/firesim-cull-instances.yml +++ b/.github/workflows/firesim-cleanup.yml @@ -1,4 +1,4 @@ -name: firesim-cull-instances +name: firesim-cleanup on: schedule: @@ -23,3 +23,13 @@ jobs: - uses: actions/checkout@v2 - uses: ./.github/actions/repo-setup-aws - run: .github/scripts/cull-old-ci-instances.py + + cull-old-ci-runners: + name: cull-old-ci-runners + runs-on: ubuntu-latest + env: + TERM: xterm-256-color + steps: + - uses: actions/checkout@v2 + - uses: ./.github/actions/repo-setup-aws + - run: .github/scripts/cull-old-ci-runners.py From 7ab50caf2bf9b51d5d10c3905ae63d4f0d5a260c Mon Sep 17 00:00:00 2001 From: Abraham Gonzalez Date: Tue, 9 Aug 2022 22:20:44 +0000 Subject: [PATCH 3/4] Use API URL variable --- 
.github/scripts/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/common.py b/.github/scripts/common.py index b6c9b8e5..373cf05e 100644 --- a/.github/scripts/common.py +++ b/.github/scripts/common.py @@ -102,7 +102,7 @@ def get_runners(gh_token: str) -> List: return res_dict["runners"] def delete_runner(gh_token: str, runner: Dict[str, Any]) -> bool: - r = requests.delete("https://api.github.com/repos/firesim/firesim/actions/runners/{}".format(runner["id"]), headers=get_header(gh_token)) + r = requests.delete(f"""{gha_runners_api_url}/{runner["id"]}""", headers=get_header(gh_token)) if r.status_code != 204: print(f"""Unable to delete runner {runner["name"]} with id: {runner["id"]}""") return False From 61caa0d7c91526e0975bfbccc7264f2a921ee178 Mon Sep 17 00:00:00 2001 From: Abraham Gonzalez Date: Tue, 9 Aug 2022 22:33:43 +0000 Subject: [PATCH 4/4] Remove repo on fresh re-run | Remove retry amt | Get all runners --- .github/scripts/ci_variables.py | 3 +-- .github/scripts/common.py | 15 +++++++++++++-- .github/scripts/initialize-repo.py | 1 + .github/scripts/setup-manager-self-hosted.py | 2 +- 4 files changed, 16 insertions(+), 5 deletions(-) diff --git a/.github/scripts/ci_variables.py b/.github/scripts/ci_variables.py index 80fbb17b..aa68bf34 100644 --- a/.github/scripts/ci_variables.py +++ b/.github/scripts/ci_variables.py @@ -15,7 +15,6 @@ local_fsim_dir = os.path.normpath((os.path.realpath(__file__)) + "/../../..") # This is used as a unique tag for all instances launched in a workflow ci_workflow_run_id = os.environ['GITHUB_RUN_ID'] if not RUN_LOCAL else 0 -ci_workflow_run_retries = os.environ['GITHUB_RUN_ATTEMPT'] if not RUN_LOCAL else 0 ci_commit_sha1 = os.environ['GITHUB_SHA'] if not RUN_LOCAL else 0 # Multiple clones of the FireSim repository exists on manager. 
We expect state @@ -39,4 +38,4 @@ ci_personal_api_token = os.environ['PERSONAL_ACCESS_TOKEN'] if not RUN_LOCAL els ci_gha_api_url = os.environ['GITHUB_API_URL'] if not RUN_LOCAL else "" # We look this up, instead of hardcoding "firesim/firesim", to support running # this CI pipeline under forks. -ci_repo_name = os.environ['GITHUB_REPOSITORY'] if not RUN_LOCAL else "" \ No newline at end of file +ci_repo_name = os.environ['GITHUB_REPOSITORY'] if not RUN_LOCAL else "" diff --git a/.github/scripts/common.py b/.github/scripts/common.py index 373cf05e..77cfeaad 100644 --- a/.github/scripts/common.py +++ b/.github/scripts/common.py @@ -1,6 +1,7 @@ import sys import boto3 import os +import math from fabric.api import * import requests from ci_variables import ci_firesim_dir, local_fsim_dir, ci_gha_api_url, ci_repo_name @@ -97,9 +98,19 @@ def get_header(gh_token: str) -> Dict[str, str]: def get_runners(gh_token: str) -> List: r = requests.get(gha_runners_api_url, headers=get_header(gh_token)) if r.status_code != 200: - raise Exception("Unable to retrieve list of GitHub Actions Runners") + raise Exception("Unable to retrieve count of GitHub Actions Runners") res_dict = r.json() - return res_dict["runners"] + runner_count = res_dict["total_count"] + + runners = [] + for page_idx in range(math.ceil(runner_count / 30)): + r = requests.get(gha_runners_api_url, params={"per_page" : 30, "page" : page_idx + 1}, headers=get_header(gh_token)) + if r.status_code != 200: + raise Exception("Unable to retrieve (sub)list of GitHub Actions Runners") + res_dict = r.json() + runners = runners + res_dict["runners"] + + return runners def delete_runner(gh_token: str, runner: Dict[str, Any]) -> bool: r = requests.delete(f"""{gha_runners_api_url}/{runner["id"]}""", headers=get_header(gh_token)) diff --git a/.github/scripts/initialize-repo.py b/.github/scripts/initialize-repo.py index 75368201..e7efa8b3 100755 --- a/.github/scripts/initialize-repo.py +++ b/.github/scripts/initialize-repo.py @@ 
-10,6 +10,7 @@ def initialize_repo(): """ Initializes firesim repo: clones, runs build-setup, and intializes marshal submodules """ with cd(manager_home_dir): + run("rm -rf {}".format(manager_fsim_dir)) # copy ci version of the repo into the new globally accessible location run("git clone {} {}".format(ci_workdir, manager_fsim_dir)) diff --git a/.github/scripts/setup-manager-self-hosted.py b/.github/scripts/setup-manager-self-hosted.py index 2ee2dc14..bb3c067c 100755 --- a/.github/scripts/setup-manager-self-hosted.py +++ b/.github/scripts/setup-manager-self-hosted.py @@ -67,7 +67,7 @@ def setup_self_hosted_runners(): # config runner put(".github/scripts/gh-a-runner.expect", actions_dir) run("chmod +x gh-a-runner.expect") - runner_name = f"{ci_workflow_run_id}-{ci_workflow_run_retries}-{runner_idx}" # used to teardown runner + runner_name = f"{ci_workflow_run_id}-{runner_idx}" # used to teardown runner unique_label = ci_workflow_run_id # used within the yaml to choose a runner run("./gh-a-runner.expect {} {} {}".format(reg_token, runner_name, unique_label))