Merge pull request #1135 from firesim/fix-ci

CI Runner Cleanup
This commit is contained in:
Abraham Gonzalez 2022-08-10 11:48:43 -07:00 committed by GitHub
commit fc0a80d4e3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 82 additions and 23 deletions

View File

@ -38,4 +38,4 @@ ci_personal_api_token = os.environ['PERSONAL_ACCESS_TOKEN'] if not RUN_LOCAL els
ci_gha_api_url = os.environ['GITHUB_API_URL'] if not RUN_LOCAL else ""
# We look this up, instead of hardcoding "firesim/firesim", to support running
# this CI pipeline under forks.
ci_repo_name = os.environ['GITHUB_REPOSITORY'] if not RUN_LOCAL else ""
ci_repo_name = os.environ['GITHUB_REPOSITORY'] if not RUN_LOCAL else ""

View File

@ -1,10 +1,13 @@
import sys
import boto3
import os
import math
from fabric.api import *
import requests
from ci_variables import ci_firesim_dir, local_fsim_dir, ci_gha_api_url, ci_repo_name
from typing import Dict, List, Any
# Reuse manager utilities
# Note: ci_firesim_dir must not be used here because the persistent clone may not be initialized yet.
sys.path.append(local_fsim_dir + "/deploy")
@ -89,23 +92,44 @@ def instance_metadata_str(instance):
return static_md + dynamic_md
def deregister_runner_if_exists(gh_token, runner_name):
headers = {'Authorization': "token {}".format(gh_token.strip())}
def get_header(gh_token: str) -> Dict[str, str]:
    """Build the HTTP headers sent with each GitHub API request.

    Args:
        gh_token: API token; leading/trailing whitespace (e.g. a trailing
            newline) is stripped before use.

    Returns:
        A dict with an ``Authorization`` entry of the form ``token <token>``
        and an ``Accept`` entry of ``application/vnd.github+json``.
    """
    cleaned_token = gh_token.strip()
    headers: Dict[str, str] = {}
    headers["Authorization"] = f"token {cleaned_token}"
    headers["Accept"] = "application/vnd.github+json"
    return headers
# Check if exists before deregistering
r = requests.get(gha_runners_api_url, headers=headers)
def get_runners(gh_token: str) -> List:
r = requests.get(gha_runners_api_url, headers=get_header(gh_token))
if r.status_code != 200:
# if couldn't delete then just exit
return
raise Exception("Unable to retrieve count of GitHub Actions Runners")
res_dict = r.json()
runner_list = res_dict["runners"]
for runner in runner_list:
runner_count = res_dict["total_count"]
runners = []
for page_idx in range(math.ceil(runner_count / 30)):
r = requests.get(gha_runners_api_url, params={"per_page" : 30, "page" : page_idx + 1}, headers=get_header(gh_token))
if r.status_code != 200:
raise Exception("Unable to retrieve (sub)list of GitHub Actions Runners")
res_dict = r.json()
runners = runners + res_dict["runners"]
return runners
def delete_runner(gh_token: str, runner: Dict[str, Any]) -> bool:
    """Ask the runners API to delete a single runner.

    Args:
        gh_token: API token, passed to ``get_header`` to build the request
            headers.
        runner: Runner description; its ``"id"`` key is used to build the
            request URL, and ``"name"`` is read only for the failure message.

    Returns:
        True when the DELETE request answers with status 204; otherwise a
        message is printed and False is returned.
    """
    target_url = f"""{gha_runners_api_url}/{runner["id"]}"""
    response = requests.delete(target_url, headers=get_header(gh_token))
    if response.status_code == 204:
        return True
    # Best effort: report the failure and let the caller decide what to do.
    print(f"""Unable to delete runner {runner["name"]} with id: {runner["id"]}""")
    return False
def deregister_offline_runners(gh_token: str) -> None:
    """Delete every runner whose reported status is ``"offline"``.

    The runner list comes from ``get_runners``; each offline entry is handed
    to ``delete_runner``, whose success/failure result is not inspected here.

    Args:
        gh_token: API token forwarded to ``get_runners`` and ``delete_runner``.
    """
    for candidate in get_runners(gh_token):
        if candidate["status"] != "offline":
            continue
        delete_runner(gh_token, candidate)
def deregister_runners(gh_token: str, runner_name: str) -> None:
runners = get_runners(gh_token)
for runner in runners:
if runner_name in runner["name"]:
r = requests.delete(f"""{gha_runners_api_url}/{runner["id"]}""", headers=headers)
if r.status_code != 204:
# if couldn't delete then just exit
return
delete_runner(gh_token, runner)
def change_workflow_instance_states(gh_token, tag_value, state_change, dryrun=False):
""" Change the state of all instances sharing the same CI workflow run's tag. """
@ -123,7 +147,7 @@ def change_workflow_instance_states(gh_token, tag_value, state_change, dryrun=Fa
client = boto3.client('ec2')
if state_change == 'stop':
print("Stopping instances: {}".format(", ".join(instance_ids)))
deregister_runner_if_exists(gh_token, tag_value)
deregister_runners(gh_token, tag_value)
client.stop_instances(InstanceIds=instance_ids, DryRun=dryrun)
elif state_change == 'start':
print("Starting instances: {}".format(", ".join(instance_ids)))
@ -140,7 +164,7 @@ def change_workflow_instance_states(gh_token, tag_value, state_change, dryrun=Fa
elif state_change == 'terminate':
print("Terminating instances: {}".format(", ".join(instance_ids)))
deregister_runner_if_exists(gh_token, tag_value)
deregister_runners(gh_token, tag_value)
client.terminate_instances(InstanceIds=instance_ids, DryRun=dryrun)
else:
raise ValueError("Unrecognized transition type: {}".format(state_change))

View File

@ -8,7 +8,7 @@ import pytz
import boto3
import sys
from common import unique_tag_key, deregister_runner_if_exists
from common import unique_tag_key, deregister_runners
# Reuse manager utilities
from ci_variables import ci_workdir, ci_personal_api_token, ci_workflow_run_id
@ -31,7 +31,7 @@ def main():
for inst in all_ci_instances:
lifetime_secs = (current_time - inst["LaunchTime"]).total_seconds()
if lifetime_secs > (INSTANCE_LIFETIME_LIMIT_HOURS * 3600):
deregister_runner_if_exists(ci_personal_api_token, ci_workflow_run_id)
deregister_runners(ci_personal_api_token, ci_workflow_run_id)
client.terminate_instances(InstanceIds=[inst["InstanceId"]])
print(" " + inst["InstanceId"])

16
.github/scripts/cull-old-ci-runners.py vendored Executable file
View File

@ -0,0 +1,16 @@
#!/usr/bin/env python3
# Runs periodically in its own workflow in the CI/CD environment to tear down
# runners that are offline
from common import deregister_offline_runners
# Reuse manager utilities
from ci_variables import ci_personal_api_token
def main() -> None:
    """Deregister all offline runners using the CI personal API token."""
    deregister_offline_runners(gh_token=ci_personal_api_token)
if __name__ == "__main__":
main()

View File

@ -10,6 +10,7 @@ def initialize_repo():
""" Initializes firesim repo: clones, runs build-setup, and intializes marshal submodules """
with cd(manager_home_dir):
run("rm -rf {}".format(manager_fsim_dir))
# copy ci version of the repo into the new globally accessible location
run("git clone {} {}".format(ci_workdir, manager_fsim_dir))

View File

@ -37,8 +37,17 @@ def setup_self_hosted_runners():
print("Using Github Actions Runner v{}".format(RUNNER_VERSION))
# create NUM_RUNNERS self-hosted runners on the manager that run in parallel
NUM_RUNNERS = 4
# verify no existing runners are running and remove unused runners
with settings(warn_only=True):
for runner_idx in range(NUM_RUNNERS):
run(f"screen -XS gh-a-runner-{runner_idx} quit")
deregister_runners(ci_personal_api_token, ci_workflow_run_id)
# spawn runners
for runner_idx in range(NUM_RUNNERS):
actions_dir = "{}/actions-runner-{}".format(manager_home_dir, runner_idx)
run("rm -rf {}".format(actions_dir))
run("mkdir -p {}".format(actions_dir))
with cd(actions_dir):
run("curl -o actions-runner-linux-x64-{}.tar.gz -L https://github.com/actions/runner/releases/download/v{}/actions-runner-linux-x64-{}.tar.gz".format(RUNNER_VERSION, RUNNER_VERSION, RUNNER_VERSION))
@ -48,10 +57,9 @@ def setup_self_hosted_runners():
run("sudo ./bin/installdependencies.sh")
# get registration token from API
headers = {'Authorization': "token {}".format(ci_personal_api_token.strip())}
r = requests.post(f"{gha_runners_api_url}/registration-token", headers=headers)
r = requests.post(f"{gha_runners_api_url}/registration-token", headers=get_header(ci_personal_api_token))
if r.status_code != 201:
raise Exception("HTTPS error: {} {}. Retrying.".format(r.status_code, r.json()))
raise Exception("HTTPS error: {} {}".format(r.status_code, r.json()))
res_dict = r.json()
reg_token = res_dict["token"]
@ -59,7 +67,7 @@ def setup_self_hosted_runners():
# config runner
put(".github/scripts/gh-a-runner.expect", actions_dir)
run("chmod +x gh-a-runner.expect")
runner_name = "{}-{}".format(ci_workflow_run_id, runner_idx) # used to teardown runner
runner_name = f"{ci_workflow_run_id}-{runner_idx}" # used to teardown runner
unique_label = ci_workflow_run_id # used within the yaml to choose a runner
run("./gh-a-runner.expect {} {} {}".format(reg_token, runner_name, unique_label))

View File

@ -1,4 +1,4 @@
name: firesim-cull-instances
name: firesim-cleanup
on:
schedule:
@ -23,3 +23,13 @@ jobs:
- uses: actions/checkout@v2
- uses: ./.github/actions/repo-setup-aws
- run: .github/scripts/cull-old-ci-instances.py
cull-old-ci-runners:
name: cull-old-ci-runners
runs-on: ubuntu-latest
env:
TERM: xterm-256-color
steps:
- uses: actions/checkout@v2
- uses: ./.github/actions/repo-setup-aws
- run: .github/scripts/cull-old-ci-runners.py