commit
fc0a80d4e3
|
@ -38,4 +38,4 @@ ci_personal_api_token = os.environ['PERSONAL_ACCESS_TOKEN'] if not RUN_LOCAL els
|
|||
ci_gha_api_url = os.environ['GITHUB_API_URL'] if not RUN_LOCAL else ""
|
||||
# We look this up, instead of hardcoding "firesim/firesim", to support running
|
||||
# this CI pipeline under forks.
|
||||
ci_repo_name = os.environ['GITHUB_REPOSITORY'] if not RUN_LOCAL else ""
|
||||
ci_repo_name = os.environ['GITHUB_REPOSITORY'] if not RUN_LOCAL else ""
|
||||
|
|
|
@ -1,10 +1,13 @@
|
|||
import sys
|
||||
import boto3
|
||||
import os
|
||||
import math
|
||||
from fabric.api import *
|
||||
import requests
|
||||
from ci_variables import ci_firesim_dir, local_fsim_dir, ci_gha_api_url, ci_repo_name
|
||||
|
||||
from typing import Dict, List, Any
|
||||
|
||||
# Reuse manager utilities
|
||||
# Note: ci_firesim_dir must not be used here because the persistent clone may not be initialized yet.
|
||||
sys.path.append(local_fsim_dir + "/deploy")
|
||||
|
@ -89,23 +92,44 @@ def instance_metadata_str(instance):
|
|||
|
||||
return static_md + dynamic_md
|
||||
|
||||
def deregister_runner_if_exists(gh_token, runner_name):
|
||||
headers = {'Authorization': "token {}".format(gh_token.strip())}
|
||||
def get_header(gh_token: str) -> Dict[str, str]:
|
||||
return {"Authorization": f"token {gh_token.strip()}", "Accept": "application/vnd.github+json"}
|
||||
|
||||
# Check if exists before deregistering
|
||||
r = requests.get(gha_runners_api_url, headers=headers)
|
||||
def get_runners(gh_token: str) -> List:
|
||||
r = requests.get(gha_runners_api_url, headers=get_header(gh_token))
|
||||
if r.status_code != 200:
|
||||
# if couldn't delete then just exit
|
||||
return
|
||||
|
||||
raise Exception("Unable to retrieve count of GitHub Actions Runners")
|
||||
res_dict = r.json()
|
||||
runner_list = res_dict["runners"]
|
||||
for runner in runner_list:
|
||||
runner_count = res_dict["total_count"]
|
||||
|
||||
runners = []
|
||||
for page_idx in range(math.ceil(runner_count / 30)):
|
||||
r = requests.get(gha_runners_api_url, params={"per_page" : 30, "page" : page_idx + 1}, headers=get_header(gh_token))
|
||||
if r.status_code != 200:
|
||||
raise Exception("Unable to retrieve (sub)list of GitHub Actions Runners")
|
||||
res_dict = r.json()
|
||||
runners = runners + res_dict["runners"]
|
||||
|
||||
return runners
|
||||
|
||||
def delete_runner(gh_token: str, runner: Dict[str, Any]) -> bool:
|
||||
r = requests.delete(f"""{gha_runners_api_url}/{runner["id"]}""", headers=get_header(gh_token))
|
||||
if r.status_code != 204:
|
||||
print(f"""Unable to delete runner {runner["name"]} with id: {runner["id"]}""")
|
||||
return False
|
||||
return True
|
||||
|
||||
def deregister_offline_runners(gh_token: str) -> None:
|
||||
runners = get_runners(gh_token)
|
||||
for runner in runners:
|
||||
if runner["status"] == "offline":
|
||||
delete_runner(gh_token, runner)
|
||||
|
||||
def deregister_runners(gh_token: str, runner_name: str) -> None:
|
||||
runners = get_runners(gh_token)
|
||||
for runner in runners:
|
||||
if runner_name in runner["name"]:
|
||||
r = requests.delete(f"""{gha_runners_api_url}/{runner["id"]}""", headers=headers)
|
||||
if r.status_code != 204:
|
||||
# if couldn't delete then just exit
|
||||
return
|
||||
delete_runner(gh_token, runner)
|
||||
|
||||
def change_workflow_instance_states(gh_token, tag_value, state_change, dryrun=False):
|
||||
""" Change the state of all instances sharing the same CI workflow run's tag. """
|
||||
|
@ -123,7 +147,7 @@ def change_workflow_instance_states(gh_token, tag_value, state_change, dryrun=Fa
|
|||
client = boto3.client('ec2')
|
||||
if state_change == 'stop':
|
||||
print("Stopping instances: {}".format(", ".join(instance_ids)))
|
||||
deregister_runner_if_exists(gh_token, tag_value)
|
||||
deregister_runners(gh_token, tag_value)
|
||||
client.stop_instances(InstanceIds=instance_ids, DryRun=dryrun)
|
||||
elif state_change == 'start':
|
||||
print("Starting instances: {}".format(", ".join(instance_ids)))
|
||||
|
@ -140,7 +164,7 @@ def change_workflow_instance_states(gh_token, tag_value, state_change, dryrun=Fa
|
|||
|
||||
elif state_change == 'terminate':
|
||||
print("Terminating instances: {}".format(", ".join(instance_ids)))
|
||||
deregister_runner_if_exists(gh_token, tag_value)
|
||||
deregister_runners(gh_token, tag_value)
|
||||
client.terminate_instances(InstanceIds=instance_ids, DryRun=dryrun)
|
||||
else:
|
||||
raise ValueError("Unrecognized transition type: {}".format(state_change))
|
||||
|
|
|
@ -8,7 +8,7 @@ import pytz
|
|||
import boto3
|
||||
import sys
|
||||
|
||||
from common import unique_tag_key, deregister_runner_if_exists
|
||||
from common import unique_tag_key, deregister_runners
|
||||
|
||||
# Reuse manager utilities
|
||||
from ci_variables import ci_workdir, ci_personal_api_token, ci_workflow_run_id
|
||||
|
@ -31,7 +31,7 @@ def main():
|
|||
for inst in all_ci_instances:
|
||||
lifetime_secs = (current_time - inst["LaunchTime"]).total_seconds()
|
||||
if lifetime_secs > (INSTANCE_LIFETIME_LIMIT_HOURS * 3600):
|
||||
deregister_runner_if_exists(ci_personal_api_token, ci_workflow_run_id)
|
||||
deregister_runners(ci_personal_api_token, ci_workflow_run_id)
|
||||
client.terminate_instances(InstanceIds=[inst["InstanceId"]])
|
||||
print(" " + inst["InstanceId"])
|
||||
|
||||
|
|
|
@ -0,0 +1,16 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
# Runs periodically in its own workflow in the CI/CD environment to tear down
|
||||
# runners that are offline
|
||||
|
||||
from common import deregister_offline_runners
|
||||
|
||||
# Reuse manager utilities
|
||||
from ci_variables import ci_personal_api_token
|
||||
|
||||
def main():
|
||||
# deregister all offline runners
|
||||
deregister_offline_runners(ci_personal_api_token)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -10,6 +10,7 @@ def initialize_repo():
|
|||
""" Initializes firesim repo: clones, runs build-setup, and initializes marshal submodules """
|
||||
|
||||
with cd(manager_home_dir):
|
||||
run("rm -rf {}".format(manager_fsim_dir))
|
||||
# copy ci version of the repo into the new globally accessible location
|
||||
run("git clone {} {}".format(ci_workdir, manager_fsim_dir))
|
||||
|
||||
|
|
|
@ -37,8 +37,17 @@ def setup_self_hosted_runners():
|
|||
print("Using Github Actions Runner v{}".format(RUNNER_VERSION))
|
||||
# create NUM_RUNNERS self-hosted runners on the manager that run in parallel
|
||||
NUM_RUNNERS = 4
|
||||
|
||||
# verify no existing runners are running and remove unused runners
|
||||
with settings(warn_only=True):
|
||||
for runner_idx in range(NUM_RUNNERS):
|
||||
run(f"screen -XS gh-a-runner-{runner_idx} quit")
|
||||
deregister_runners(ci_personal_api_token, ci_workflow_run_id)
|
||||
|
||||
# spawn runners
|
||||
for runner_idx in range(NUM_RUNNERS):
|
||||
actions_dir = "{}/actions-runner-{}".format(manager_home_dir, runner_idx)
|
||||
run("rm -rf {}".format(actions_dir))
|
||||
run("mkdir -p {}".format(actions_dir))
|
||||
with cd(actions_dir):
|
||||
run("curl -o actions-runner-linux-x64-{}.tar.gz -L https://github.com/actions/runner/releases/download/v{}/actions-runner-linux-x64-{}.tar.gz".format(RUNNER_VERSION, RUNNER_VERSION, RUNNER_VERSION))
|
||||
|
@ -48,10 +57,9 @@ def setup_self_hosted_runners():
|
|||
run("sudo ./bin/installdependencies.sh")
|
||||
|
||||
# get registration token from API
|
||||
headers = {'Authorization': "token {}".format(ci_personal_api_token.strip())}
|
||||
r = requests.post(f"{gha_runners_api_url}/registration-token", headers=headers)
|
||||
r = requests.post(f"{gha_runners_api_url}/registration-token", headers=get_header(ci_personal_api_token))
|
||||
if r.status_code != 201:
|
||||
raise Exception("HTTPS error: {} {}. Retrying.".format(r.status_code, r.json()))
|
||||
raise Exception("HTTPS error: {} {}".format(r.status_code, r.json()))
|
||||
|
||||
res_dict = r.json()
|
||||
reg_token = res_dict["token"]
|
||||
|
@ -59,7 +67,7 @@ def setup_self_hosted_runners():
|
|||
# config runner
|
||||
put(".github/scripts/gh-a-runner.expect", actions_dir)
|
||||
run("chmod +x gh-a-runner.expect")
|
||||
runner_name = "{}-{}".format(ci_workflow_run_id, runner_idx) # used to teardown runner
|
||||
runner_name = f"{ci_workflow_run_id}-{runner_idx}" # used to teardown runner
|
||||
unique_label = ci_workflow_run_id # used within the yaml to choose a runner
|
||||
run("./gh-a-runner.expect {} {} {}".format(reg_token, runner_name, unique_label))
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
name: firesim-cull-instances
|
||||
name: firesim-cleanup
|
||||
|
||||
on:
|
||||
schedule:
|
||||
|
@ -23,3 +23,13 @@ jobs:
|
|||
- uses: actions/checkout@v2
|
||||
- uses: ./.github/actions/repo-setup-aws
|
||||
- run: .github/scripts/cull-old-ci-instances.py
|
||||
|
||||
cull-old-ci-runners:
|
||||
name: cull-old-ci-runners
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
TERM: xterm-256-color
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- uses: ./.github/actions/repo-setup-aws
|
||||
- run: .github/scripts/cull-old-ci-runners.py
|
Loading…
Reference in New Issue