Terminate based on cluster tag | Fix small issues
This commit is contained in:
parent
c1e1c9e14a
commit
27b0c5e0da
|
@ -40,11 +40,11 @@ def cull_aws_instances(current_time: DateTime) -> None:
|
|||
# Grab all instances with a CI-generated tag
|
||||
aws_platform_lib = get_platform_lib(Platform.AWS)
|
||||
all_ci_instances = aws_platform_lib.find_all_ci_instances()
|
||||
select_ci_instances = aws_platform_lib.find_select_ci_instances()
|
||||
run_farm_ci_instances = aws_platform_lib.find_run_farm_ci_instances()
|
||||
|
||||
client = boto3.client('ec2')
|
||||
|
||||
instances_to_terminate = find_timed_out_resources(FPGA_INSTANCE_LIFETIME_LIMIT_HOURS, current_time, map(lambda x: (x, x['LaunchTime']), select_ci_instances))
|
||||
instances_to_terminate = find_timed_out_resources(FPGA_INSTANCE_LIFETIME_LIMIT_HOURS, current_time, map(lambda x: (x, x['LaunchTime']), run_farm_ci_instances))
|
||||
instances_to_terminate += find_timed_out_resources(INSTANCE_LIFETIME_LIMIT_HOURS, current_time, map(lambda x: (x, x['LaunchTime']), all_ci_instances))
|
||||
instances_to_terminate = list(set(instances_to_terminate))
|
||||
|
||||
|
@ -60,10 +60,10 @@ def cull_aws_instances(current_time: DateTime) -> None:
|
|||
def cull_azure_resources(current_time: DateTime) -> None:
|
||||
azure_platform_lib = get_platform_lib(Platform.AZURE)
|
||||
all_azure_ci_vms = azure_platform_lib.find_all_ci_instances()
|
||||
select_azure_ci_vms = azure_platform_lib.find_select_ci_instances()
|
||||
run_farm_azure_ci_vms = azure_platform_lib.find_run_farm_ci_instances()
|
||||
|
||||
vms_to_terminate = find_timed_out_resources(FPGA_INSTANCE_LIFETIME_LIMIT_HOURS, current_time, \
|
||||
map(lambda x: (x, datetime.datetime.strptime(x['LaunchTime'],'%Y-%m-%d %H:%M:%S.%f%z')), select_azure_ci_vms))
|
||||
map(lambda x: (x, datetime.datetime.strptime(x['LaunchTime'],'%Y-%m-%d %H:%M:%S.%f%z')), run_farm_azure_ci_vms))
|
||||
vms_to_terminate += find_timed_out_resources(INSTANCE_LIFETIME_LIMIT_HOURS, current_time, \
|
||||
map(lambda x: (x, datetime.datetime.strptime(x['LaunchTime'],'%Y-%m-%d %H:%M:%S.%f%z')), all_azure_ci_vms))
|
||||
vms_to_terminate = list(set(vms_to_terminate))
|
||||
|
|
|
@ -79,17 +79,20 @@ class PlatformLib(metaclass=abc.ABCMeta):
|
|||
|
||||
@abc.abstractmethod
|
||||
def find_all_workflow_instances(self, workflow_tag: str) -> List:
|
||||
""" Returns all instances in this workflow (including manager) """
|
||||
""" Returns all manager instances in this workflow """
|
||||
raise NotImplementedError
|
||||
|
||||
@abc.abstractmethod
|
||||
def find_all_ci_instances(self) -> List:
|
||||
""" Returns all instances across CI workflows """
|
||||
""" Returns all manager instances across all CI workflows """
|
||||
raise NotImplementedError
|
||||
|
||||
@abc.abstractmethod
|
||||
def find_select_ci_instances(self, workflow_tag: str = '*') -> List:
|
||||
""" Grabs a list of select instances across all CI using the CI unique tag key"""
|
||||
def find_run_farm_ci_instances(self, workflow_tag: str = '*') -> List:
|
||||
"""
|
||||
Returns all run farm instance types (normally FPGA instances) that have the
|
||||
`workflow_tag` in the cluster name.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@abc.abstractmethod
|
||||
|
@ -130,8 +133,8 @@ class PlatformLib(metaclass=abc.ABCMeta):
|
|||
return f"centos@{self.get_manager_ip(workflow_tag)}"
|
||||
|
||||
@abc.abstractmethod
|
||||
def check_and_terminate_select_instances(self, timeout: int, workflow_tag: str) -> None:
|
||||
""" Check if platform-specific instances are running past a `timeout` minutes designated time. If so, then terminate them. """
|
||||
def check_and_terminate_run_farm_instances(self, timeout: int, workflow_tag: str) -> None:
|
||||
""" Check if run farm instances are running past a `timeout` minutes designated time. If so, then terminate them. """
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
|
@ -182,11 +185,11 @@ class AWSPlatformLib(PlatformLib):
|
|||
all_ci_instances = get_instances_with_filter([all_ci_instances_filter], allowed_states=['*'])
|
||||
return all_ci_instances
|
||||
|
||||
def find_run_farm_ci_instances(self, workflow_tag: str = '*') -> List:
    """Return the F1 run farm instances whose `fsimcluster` tag contains `workflow_tag`.

    Args:
        workflow_tag: Substring matched (between `*` wildcards) against the
            `fsimcluster` tag value. The default '*' makes the pattern all wildcards.

    Returns:
        Whatever `get_instances_with_filter` yields for the filter below. It is
        called with `allowed_states=['*']`, so presumably no state filtering is
        applied (confirm against that helper).
    """
    # on AWS run farm instances are marked with 'fsimcluster'
    instances_filter = [
        # EC2 filter 'Values' must be a list of strings, even for a single pattern
        {'Name': 'tag:fsimcluster', 'Values': [f'*{workflow_tag}*']},
        {'Name': 'instance-type', 'Values': ['f1.2xlarge', 'f1.4xlarge', 'f1.16xlarge']},
    ]
    return get_instances_with_filter(instances_filter, allowed_states=['*'])
|
||||
|
@ -245,7 +248,7 @@ class AWSPlatformLib(PlatformLib):
|
|||
else:
|
||||
raise ValueError(f"Unrecognized transition type: {state_change}")
|
||||
|
||||
def get_platform_enum(self) -> None:
|
||||
def get_platform_enum(self) -> Platform:
|
||||
return Platform.AWS
|
||||
|
||||
def get_manager_metadata_string(self, workflow_tag: str) -> str:
|
||||
|
@ -264,18 +267,21 @@ class AWSPlatformLib(PlatformLib):
|
|||
|
||||
return static_md + dynamic_md
|
||||
|
||||
def check_and_terminate_select_instances(self, timeout: int, workflow_tag: str) -> None:
|
||||
# terminate f1.{2,16}xlarge instances after timeout minutes of running (extra backup)
|
||||
instances = self.find_select_ci_instances(workflow_tag)
|
||||
def check_and_terminate_run_farm_instances(self, timeout: int, workflow_tag: str) -> None:
|
||||
# We need this in case terminate is called in setup-self-hosted-workflow before aws-configure is run
|
||||
if self.client is None:
|
||||
self.client = boto3.client('ec2')
|
||||
|
||||
instances = self.find_run_farm_ci_instances(workflow_tag)
|
||||
terminated_insts = False
|
||||
for inst in instances:
|
||||
if (datetime.datetime.now() - inst.launch_time) >= datetime.timedelta(minutes=timeout):
|
||||
print("Uncaught FPGA instance shutdown detected")
|
||||
print("Uncaught run farm instance shutdown detected")
|
||||
|
||||
instids = [ inst.instance_id ]
|
||||
terminate_instances(instids, False)
|
||||
self.client.terminate_instances(InstanceIds=instids, DryRun=False)
|
||||
|
||||
print(f"Terminated FPGA instance {instids}")
|
||||
print(f"Terminated run farm instance {instids}")
|
||||
terminated_insts = True
|
||||
|
||||
# post comment after instances are terminated just in case there is an issue with posting
|
||||
|
@ -428,8 +434,8 @@ class AzurePlatformLib(PlatformLib):
|
|||
else:
|
||||
print(f"Succeeded in deleting VM {vm['name']}")
|
||||
|
||||
def check_and_terminate_select_instances(self, timeout: int, workflow_tag: str) -> None:
|
||||
def check_and_terminate_run_farm_instances(self, timeout: int, workflow_tag: str) -> None:
|
||||
raise NotImplementedError
|
||||
|
||||
def find_select_ci_instances(self, workflow_tag: str = '*') -> List:
|
||||
def find_run_farm_ci_instances(self, workflow_tag: str = '*') -> List:
|
||||
raise NotImplementedError
|
||||
|
|
|
@ -35,7 +35,7 @@ TERMINATE_STATES = ["cancelled", "success", "skipped", "stale", "failure", "time
|
|||
STOP_STATES = []
|
||||
NOP_STATES = ["action_required"] # TODO: unsure when this happens
|
||||
|
||||
def wrap_in_code(wrap: str):
|
||||
def wrap_in_code(wrap: str) -> str:
|
||||
return f"\n```\n{wrap}\n```"
|
||||
|
||||
def main(platform: Platform):
|
||||
|
@ -59,13 +59,16 @@ def main(platform: Platform):
|
|||
print(f"Workflow {ci_env['GITHUB_RUN_ID']} status: {state_status} {state_concl}")
|
||||
|
||||
# check that select instances are terminated on time
|
||||
platform_lib.check_and_terminate_select_instances(45, ci_env['GITHUB_RUN_ID'])
|
||||
platform_lib.check_and_terminate_run_farm_instances(45, ci_env['GITHUB_RUN_ID'])
|
||||
|
||||
if state_status in ['completed']:
|
||||
if state_concl in TERMINATE_STATES:
|
||||
platform_lib.check_and_terminate_run_farm_instances(0, ci_env['GITHUB_RUN_ID'])
|
||||
platform_lib.terminate_instances(ci_env['PERSONAL_ACCESS_TOKEN'], ci_env['GITHUB_RUN_ID'])
|
||||
return
|
||||
elif state_concl in STOP_STATES:
|
||||
# if we stop then we should terminate the run farm instances
|
||||
platform_lib.check_and_terminate_run_farm_instances(0, ci_env['GITHUB_RUN_ID'])
|
||||
platform_lib.stop_instances(ci_env['PERSONAL_ACCESS_TOKEN'], ci_env['GITHUB_RUN_ID'])
|
||||
return
|
||||
elif state_concl not in NOP_STATES:
|
||||
|
@ -86,7 +89,7 @@ def main(platform: Platform):
|
|||
|
||||
issue_post(ci_env['PERSONAL_ACCESS_TOKEN'], post_str)
|
||||
|
||||
platform_lib.check_and_terminate_select_instances(0, ci_env['GITHUB_RUN_ID'])
|
||||
platform_lib.check_and_terminate_run_farm_instances(0, ci_env['GITHUB_RUN_ID'])
|
||||
platform_lib.terminate_instances(ci_env['PERSONAL_ACCESS_TOKEN'], ci_env['GITHUB_RUN_ID'])
|
||||
|
||||
post_str = f"Instances for CI run {ci_env['GITHUB_RUN_ID']} were supposedly terminated. Verify termination manually.\n"
|
||||
|
|
Loading…
Reference in New Issue