Terminate outside inst. deploy manager | Fix monitor loop
This commit is contained in:
parent
98fa2f9b25
commit
964ab8ca0b
|
@ -521,13 +521,18 @@ class FireSimTopologyWithPasses:
|
||||||
self.boot_simulation_passes(False, skip_instance_binding=True)
|
self.boot_simulation_passes(False, skip_instance_binding=True)
|
||||||
|
|
||||||
@parallel
|
@parallel
|
||||||
def monitor_jobs_wrapper(run_farm, completed_jobs: List[str], teardown: bool, terminateoncompletion: bool, job_results_dir: str) -> Dict[str, Dict[str, bool]]:
|
def monitor_jobs_wrapper(
|
||||||
|
run_farm: RunFarm,
|
||||||
|
prior_completed_jobs: List[str],
|
||||||
|
is_final_loop: bool,
|
||||||
|
is_networked: bool,
|
||||||
|
terminateoncompletion: bool,
|
||||||
|
job_results_dir: str) -> Dict[str, Dict[str, bool]]:
|
||||||
""" on each instance, check over its switches and simulations
|
""" on each instance, check over its switches and simulations
|
||||||
to copy results off. """
|
to copy results off. """
|
||||||
my_node = run_farm.lookup_by_host(env.host_string)
|
my_node = run_farm.lookup_by_host(env.host_string)
|
||||||
assert my_node.instance_deploy_manager is not None
|
assert my_node.instance_deploy_manager is not None
|
||||||
return my_node.instance_deploy_manager.monitor_jobs_instance(completed_jobs, teardown, terminateoncompletion, job_results_dir)
|
return my_node.instance_deploy_manager.monitor_jobs_instance(prior_completed_jobs, is_final_loop, is_networked, terminateoncompletion, job_results_dir)
|
||||||
|
|
||||||
|
|
||||||
def loop_logger(instancestates: Dict[str, Any], terminateoncompletion: bool) -> None:
|
def loop_logger(instancestates: Dict[str, Any], terminateoncompletion: bool) -> None:
|
||||||
""" Print the simulation status nicely. """
|
""" Print the simulation status nicely. """
|
||||||
|
@ -601,8 +606,8 @@ class FireSimTopologyWithPasses:
|
||||||
rootLogger.info("""{}/{} simulations are still running.""".format(runningsims, totalsims))
|
rootLogger.info("""{}/{} simulations are still running.""".format(runningsims, totalsims))
|
||||||
rootLogger.info("-"*80)
|
rootLogger.info("-"*80)
|
||||||
|
|
||||||
# teardown is required if roots are switches
|
# is networked if a switch node is the root
|
||||||
teardown_required = isinstance(self.firesimtopol.roots[0], FireSimSwitchNode)
|
is_networked = isinstance(self.firesimtopol.roots[0], FireSimSwitchNode)
|
||||||
|
|
||||||
# run polling loop
|
# run polling loop
|
||||||
while True:
|
while True:
|
||||||
|
@ -612,24 +617,21 @@ class FireSimTopologyWithPasses:
|
||||||
def get_jobs_completed_local_info():
|
def get_jobs_completed_local_info():
|
||||||
# this is a list of jobs completed, since any completed job will have
|
# this is a list of jobs completed, since any completed job will have
|
||||||
# a directory within this directory.
|
# a directory within this directory.
|
||||||
jobscompleted = os.listdir(self.workload.job_monitoring_dir)
|
monitored_jobs_completed = os.listdir(self.workload.job_monitoring_dir)
|
||||||
rootLogger.debug("Monitoring dir jobs completed: " + str(jobscompleted))
|
rootLogger.debug(f"Monitoring dir jobs completed: {monitored_jobs_completed}")
|
||||||
return jobscompleted
|
return monitored_jobs_completed
|
||||||
|
|
||||||
jobscompleted = get_jobs_completed_local_info()
|
# return all the state about the instance (potentially copy back results and/or terminate)
|
||||||
|
is_final_run = False
|
||||||
|
monitored_jobs_completed = get_jobs_completed_local_info()
|
||||||
# this job on the instance should return all the state about the instance
|
instancestates = execute(monitor_jobs_wrapper,
|
||||||
# e.g.:
|
self.run_farm,
|
||||||
# if an instance has been terminated (really - is termination
|
monitored_jobs_completed,
|
||||||
# requested and no jobs are left, then we will have implicitly
|
is_final_run,
|
||||||
# terminated
|
is_networked,
|
||||||
teardown = False
|
self.terminateoncompletion,
|
||||||
instancestates = execute(monitor_jobs_wrapper, self.run_farm,
|
self.workload.job_results_dir,
|
||||||
jobscompleted, teardown,
|
hosts=all_run_farm_ips)
|
||||||
self.terminateoncompletion,
|
|
||||||
self.workload.job_results_dir,
|
|
||||||
hosts=all_run_farm_ips)
|
|
||||||
|
|
||||||
# log sim state, raw
|
# log sim state, raw
|
||||||
rootLogger.debug(pprint.pformat(instancestates))
|
rootLogger.debug(pprint.pformat(instancestates))
|
||||||
|
@ -637,31 +639,37 @@ class FireSimTopologyWithPasses:
|
||||||
# log sim state, properly
|
# log sim state, properly
|
||||||
loop_logger(instancestates, self.terminateoncompletion)
|
loop_logger(instancestates, self.terminateoncompletion)
|
||||||
|
|
||||||
jobs_complete_dict = dict()
|
jobs_complete_dict = {}
|
||||||
simstates = [x['sims'] for x in instancestates.values()]
|
simstates = [x['sims'] for x in instancestates.values()]
|
||||||
for x in simstates:
|
for x in simstates:
|
||||||
jobs_complete_dict.update(x)
|
jobs_complete_dict.update(x)
|
||||||
global_status = jobs_complete_dict.values()
|
global_status = jobs_complete_dict.values()
|
||||||
rootLogger.debug("jobs complete dict " + str(jobs_complete_dict))
|
rootLogger.debug(f"Jobs complete: {jobs_complete_dict}")
|
||||||
rootLogger.debug("global status: " + str(global_status))
|
rootLogger.debug(f"Global status: {global_status}")
|
||||||
|
|
||||||
|
if is_networked and any(global_status):
|
||||||
|
# at least one simulation has finished
|
||||||
|
|
||||||
if teardown_required and any(global_status):
|
|
||||||
# in this case, do the teardown, then call exec again, then exit
|
# in this case, do the teardown, then call exec again, then exit
|
||||||
rootLogger.info("Teardown required, manually tearing down...")
|
rootLogger.info("Networked simulation, manually tearing down all instances...")
|
||||||
# do not disconnect nbds, because we may need them for copying
|
# do not disconnect nbds, because we may need them for copying
|
||||||
# results. the process of copying results will tear them down anyway
|
# results. the process of copying results will tear them down anyway
|
||||||
self.kill_simulation_passes(use_mock_instances_for_testing, disconnect_all_nbds=False)
|
self.kill_simulation_passes(use_mock_instances_for_testing, disconnect_all_nbds=False)
|
||||||
rootLogger.debug("continuing one more loop to fully copy results and terminate")
|
|
||||||
teardown = True
|
rootLogger.debug("One more loop to fully copy results and terminate.")
|
||||||
# get latest local info about jobs completed. avoid extra copy
|
is_final_run = True
|
||||||
jobscompleted = get_jobs_completed_local_info()
|
monitored_jobs_completed = get_jobs_completed_local_info()
|
||||||
instancestates = execute(monitor_jobs_wrapper, self.run_farm,
|
instancestates = execute(monitor_jobs_wrapper,
|
||||||
jobscompleted, teardown,
|
self.run_farm,
|
||||||
self.terminateoncompletion,
|
monitored_jobs_completed,
|
||||||
self.workload.job_results_dir,
|
is_final_run,
|
||||||
hosts=all_run_farm_ips)
|
is_networked,
|
||||||
|
self.terminateoncompletion,
|
||||||
|
self.workload.job_results_dir,
|
||||||
|
hosts=all_run_farm_ips)
|
||||||
break
|
break
|
||||||
if not teardown_required and all(global_status):
|
|
||||||
|
if not is_networked and all(global_status):
|
||||||
break
|
break
|
||||||
|
|
||||||
time.sleep(10)
|
time.sleep(10)
|
||||||
|
|
|
@ -18,8 +18,8 @@ from util.io import firesim_input
|
||||||
from runtools.run_farm_deploy_managers import InstanceDeployManager, EC2InstanceDeployManager
|
from runtools.run_farm_deploy_managers import InstanceDeployManager, EC2InstanceDeployManager
|
||||||
|
|
||||||
from typing import Any, Dict, Optional, List, Union, Set, Type, Tuple, TYPE_CHECKING
|
from typing import Any, Dict, Optional, List, Union, Set, Type, Tuple, TYPE_CHECKING
|
||||||
|
from mypy_boto3_ec2.service_resource import Instance as EC2InstanceResource
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from mypy_boto3_ec2.service_resource import Instance as EC2InstanceResource
|
|
||||||
from runtools.firesim_topology_elements import FireSimSwitchNode, FireSimServerNode
|
from runtools.firesim_topology_elements import FireSimSwitchNode, FireSimServerNode
|
||||||
|
|
||||||
rootLogger = logging.getLogger()
|
rootLogger = logging.getLogger()
|
||||||
|
@ -28,6 +28,7 @@ class Inst(metaclass=abc.ABCMeta):
|
||||||
"""Run farm hosts that can hold simulations or switches.
|
"""Run farm hosts that can hold simulations or switches.
|
||||||
|
|
||||||
Attributes:
|
Attributes:
|
||||||
|
run_farm: handle to run farm this instance is a part of
|
||||||
MAX_SWITCH_SLOTS_ALLOWED: max switch slots allowed (hardcoded)
|
MAX_SWITCH_SLOTS_ALLOWED: max switch slots allowed (hardcoded)
|
||||||
switch_slots: switch node slots
|
switch_slots: switch node slots
|
||||||
_next_switch_port: next switch port to assign
|
_next_switch_port: next switch port to assign
|
||||||
|
@ -39,6 +40,8 @@ class Inst(metaclass=abc.ABCMeta):
|
||||||
metasimulation_enabled: true if this instance will be running metasimulations
|
metasimulation_enabled: true if this instance will be running metasimulations
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
run_farm: RunFarm
|
||||||
|
|
||||||
# switch variables
|
# switch variables
|
||||||
# restricted by default security group network model port alloc (10000 to 11000)
|
# restricted by default security group network model port alloc (10000 to 11000)
|
||||||
MAX_SWITCH_SLOTS_ALLOWED: int = 1000
|
MAX_SWITCH_SLOTS_ALLOWED: int = 1000
|
||||||
|
@ -58,8 +61,11 @@ class Inst(metaclass=abc.ABCMeta):
|
||||||
|
|
||||||
metasimulation_enabled: bool
|
metasimulation_enabled: bool
|
||||||
|
|
||||||
def __init__(self, max_sim_slots_allowed: int, instance_deploy_manager: Type[InstanceDeployManager], sim_dir: Optional[str] = None, metasimulation_enabled: bool = False) -> None:
|
def __init__(self, run_farm: RunFarm, max_sim_slots_allowed: int, instance_deploy_manager: Type[InstanceDeployManager], sim_dir: Optional[str] = None, metasimulation_enabled: bool = False) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
|
self.run_farm = run_farm
|
||||||
|
|
||||||
self.switch_slots = []
|
self.switch_slots = []
|
||||||
self._next_switch_port = 10000 # track ports to allocate for server switch model ports
|
self._next_switch_port = 10000 # track ports to allocate for server switch model ports
|
||||||
|
|
||||||
|
@ -111,6 +117,10 @@ class Inst(metaclass=abc.ABCMeta):
|
||||||
""" Return True iff any simulation on this Inst requires qcow2. """
|
""" Return True iff any simulation on this Inst requires qcow2. """
|
||||||
return any([x.qcow2_support_required() for x in self.sim_slots])
|
return any([x.qcow2_support_required() for x in self.sim_slots])
|
||||||
|
|
||||||
|
def terminate_self(self) -> None:
|
||||||
|
""" Terminate the current host for the Inst. """
|
||||||
|
self.run_farm.terminate_by_inst(self)
|
||||||
|
|
||||||
class RunFarm(metaclass=abc.ABCMeta):
|
class RunFarm(metaclass=abc.ABCMeta):
|
||||||
"""Abstract class to represent how to manage run farm hosts (similar to `BuildFarm`).
|
"""Abstract class to represent how to manage run farm hosts (similar to `BuildFarm`).
|
||||||
In addition to having to implement how to spawn/terminate nodes, the child classes must
|
In addition to having to implement how to spawn/terminate nodes, the child classes must
|
||||||
|
@ -250,6 +260,11 @@ class RunFarm(metaclass=abc.ABCMeta):
|
||||||
"""Return run farm host based on host."""
|
"""Return run farm host based on host."""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
@abc.abstractmethod
|
||||||
|
def terminate_by_inst(self, inst: Inst) -> None:
|
||||||
|
"""Terminate run farm host based on Inst object."""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
def invert_filter_sort(input_dict: Dict[str, int]) -> List[Tuple[int, str]]:
|
def invert_filter_sort(input_dict: Dict[str, int]) -> List[Tuple[int, str]]:
|
||||||
"""Take a dict, convert to list of pairs, flip key and value,
|
"""Take a dict, convert to list of pairs, flip key and value,
|
||||||
remove all keys equal to zero, then sort on the new key."""
|
remove all keys equal to zero, then sort on the new key."""
|
||||||
|
@ -350,7 +365,7 @@ class AWSEC2F1(RunFarm):
|
||||||
|
|
||||||
insts: List[Tuple[Inst, Optional[Union[EC2InstanceResource, MockBoto3Instance]]]] = []
|
insts: List[Tuple[Inst, Optional[Union[EC2InstanceResource, MockBoto3Instance]]]] = []
|
||||||
for _ in range(num_insts):
|
for _ in range(num_insts):
|
||||||
insts.append((Inst(num_sim_slots, dispatch_dict[platform], simulation_dir, self.metasimulation_enabled), None))
|
insts.append((Inst(self, num_sim_slots, dispatch_dict[platform], simulation_dir, self.metasimulation_enabled), None))
|
||||||
self.run_farm_hosts_dict[inst_handle] = insts
|
self.run_farm_hosts_dict[inst_handle] = insts
|
||||||
self.mapper_consumed[inst_handle] = 0
|
self.mapper_consumed[inst_handle] = 0
|
||||||
|
|
||||||
|
@ -496,6 +511,18 @@ class AWSEC2F1(RunFarm):
|
||||||
return host_node
|
return host_node
|
||||||
assert False, f"Unable to find host node by {host}"
|
assert False, f"Unable to find host node by {host}"
|
||||||
|
|
||||||
|
def terminate_by_inst(self, inst: Inst) -> None:
|
||||||
|
"""Terminate run farm host based on Inst object."""
|
||||||
|
for sim_host_handle in sorted(self.SIM_HOST_HANDLE_TO_MAX_FPGA_SLOTS):
|
||||||
|
inst_list = self.run_farm_hosts_dict[sim_host_handle]
|
||||||
|
for inner_inst, boto in inst_list:
|
||||||
|
if inner_inst.get_host() == inst.get_host():
|
||||||
|
# EC2InstanceResource can only be used for typing checks
|
||||||
|
# preventing its use for the isinstance() check
|
||||||
|
assert boto is not None and not isinstance(boto, MockBoto3Instance)
|
||||||
|
instanceids = get_instance_ids_for_instances([boto])
|
||||||
|
terminate_instances(instanceids, dryrun=False)
|
||||||
|
|
||||||
class ExternallyProvisioned(RunFarm):
|
class ExternallyProvisioned(RunFarm):
|
||||||
"""This manages the set of externally provisioned instances. This class doesn't manage
|
"""This manages the set of externally provisioned instances. This class doesn't manage
|
||||||
launch/terminating instances. It is assumed that the instances are "ready to use".
|
launch/terminating instances. It is assumed that the instances are "ready to use".
|
||||||
|
@ -552,7 +579,7 @@ class ExternallyProvisioned(RunFarm):
|
||||||
platform = host_spec.get("override_platform", default_platform)
|
platform = host_spec.get("override_platform", default_platform)
|
||||||
simulation_dir = host_spec.get("override_simulation_dir", self.default_simulation_dir)
|
simulation_dir = host_spec.get("override_simulation_dir", self.default_simulation_dir)
|
||||||
|
|
||||||
inst = Inst(num_sims, dispatch_dict[platform], simulation_dir, self.metasimulation_enabled)
|
inst = Inst(self, num_sims, dispatch_dict[platform], simulation_dir, self.metasimulation_enabled)
|
||||||
inst.set_host(ip_addr)
|
inst.set_host(ip_addr)
|
||||||
assert not ip_addr in self.run_farm_hosts_dict, f"Duplicate host name found in 'run_farm_hosts': {ip_addr}"
|
assert not ip_addr in self.run_farm_hosts_dict, f"Duplicate host name found in 'run_farm_hosts': {ip_addr}"
|
||||||
self.run_farm_hosts_dict[ip_addr] = [(inst, None)]
|
self.run_farm_hosts_dict[ip_addr] = [(inst, None)]
|
||||||
|
@ -586,3 +613,7 @@ class ExternallyProvisioned(RunFarm):
|
||||||
if host_node.get_host() == host:
|
if host_node.get_host() == host:
|
||||||
return host_node
|
return host_node
|
||||||
assert False, f"Unable to find host node by {host} host name"
|
assert False, f"Unable to find host node by {host} host name"
|
||||||
|
|
||||||
|
def terminate_by_inst(self, inst: Inst) -> None:
|
||||||
|
rootLogger.info(f"WARNING: Skipping terminate_by_inst since run hosts are externally provisioned.")
|
||||||
|
return
|
||||||
|
|
|
@ -11,11 +11,9 @@ from fabric.contrib.project import rsync_project # type: ignore
|
||||||
import time
|
import time
|
||||||
from os.path import join as pjoin
|
from os.path import join as pjoin
|
||||||
|
|
||||||
from awstools.awstools import terminate_instances, get_instance_ids_for_instances
|
|
||||||
from runtools.utils import has_sudo
|
from runtools.utils import has_sudo
|
||||||
|
|
||||||
from typing import List, Dict, Optional, Union, TYPE_CHECKING
|
from typing import List, Dict, Optional, Union, TYPE_CHECKING
|
||||||
from mypy_boto3_ec2.service_resource import Instance as EC2InstanceResource
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from runtools.firesim_topology_elements import FireSimSwitchNode, FireSimServerNode
|
from runtools.firesim_topology_elements import FireSimSwitchNode, FireSimServerNode
|
||||||
from runtools.run_farm import Inst
|
from runtools.run_farm import Inst
|
||||||
|
@ -96,9 +94,12 @@ class InstanceDeployManager(metaclass=abc.ABCMeta):
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def instance_logger(self, logstr: str) -> None:
|
def instance_logger(self, logstr: str, debug: bool = False) -> None:
|
||||||
""" Log with this host's info as prefix. """
|
""" Log with this host's info as prefix. """
|
||||||
rootLogger.info("""[{}] """.format(env.host_string) + logstr)
|
if debug:
|
||||||
|
rootLogger.debug("""[{}] """.format(env.host_string) + logstr)
|
||||||
|
else:
|
||||||
|
rootLogger.info("""[{}] """.format(env.host_string) + logstr)
|
||||||
|
|
||||||
def sim_node_qcow(self) -> None:
|
def sim_node_qcow(self) -> None:
|
||||||
""" If NBD is available and qcow2 support is required, install qemu-img
|
""" If NBD is available and qcow2 support is required, install qemu-img
|
||||||
|
@ -305,122 +306,125 @@ class InstanceDeployManager(metaclass=abc.ABCMeta):
|
||||||
switches.append(line_stripped)
|
switches.append(line_stripped)
|
||||||
return {'switches': switches, 'simdrivers': simdrivers}
|
return {'switches': switches, 'simdrivers': simdrivers}
|
||||||
|
|
||||||
def monitor_jobs_instance(self, completed_jobs: List[str], teardown: bool, terminateoncompletion: bool,
|
def monitor_jobs_instance(self,
|
||||||
|
prior_completed_jobs: List[str],
|
||||||
|
is_final_loop: bool,
|
||||||
|
is_networked: bool,
|
||||||
|
terminateoncompletion: bool,
|
||||||
job_results_dir: str) -> Dict[str, Dict[str, bool]]:
|
job_results_dir: str) -> Dict[str, Dict[str, bool]]:
|
||||||
""" Job monitoring for this host. """
|
""" Job monitoring for this host. """
|
||||||
# make a local copy of completed_jobs, so that we can update it
|
self.instance_logger(f"Final loop?: {is_final_loop} Is networked?: {is_networked} Terminateoncomplete: {terminateoncompletion}", debug=True)
|
||||||
completed_jobs = list(completed_jobs)
|
self.instance_logger(f"Prior completed jobs: {prior_completed_jobs}", debug=True)
|
||||||
|
|
||||||
|
def do_terminate():
|
||||||
|
if (not is_networked) or (is_networked and is_final_loop):
|
||||||
|
if terminateoncompletion:
|
||||||
|
self.terminate_instance()
|
||||||
|
|
||||||
rootLogger.debug("completed jobs " + str(completed_jobs))
|
|
||||||
|
|
||||||
if not self.instance_assigned_simulations() and self.instance_assigned_switches():
|
if not self.instance_assigned_simulations() and self.instance_assigned_switches():
|
||||||
# this node hosts ONLY switches and not sims
|
self.instance_logger(f"Polling switch-only node", debug=True)
|
||||||
#
|
|
||||||
# just confirm that our switches are still running
|
# just confirm that our switches are still running
|
||||||
# switches will never trigger shutdown in the cycle-accurate -
|
# switches will never trigger shutdown in the cycle-accurate -
|
||||||
# they should run forever until torn down
|
# they should run forever until torn down
|
||||||
if teardown:
|
if is_final_loop:
|
||||||
# handle the case where we're just tearing down nodes that have
|
self.instance_logger(f"Completing copies for switch-only node", debug=True)
|
||||||
# ONLY switches
|
|
||||||
for counter in range(len(self.parent_node.switch_slots)):
|
for counter in range(len(self.parent_node.switch_slots)):
|
||||||
switchsim = self.parent_node.switch_slots[counter]
|
switchsim = self.parent_node.switch_slots[counter]
|
||||||
switchsim.copy_back_switchlog_from_run(job_results_dir, counter)
|
switchsim.copy_back_switchlog_from_run(job_results_dir, counter)
|
||||||
|
|
||||||
if terminateoncompletion:
|
do_terminate()
|
||||||
# terminate the instance since teardown is called and instance
|
|
||||||
# termination is enabled
|
|
||||||
self.terminate_instance()
|
|
||||||
|
|
||||||
# don't really care about the return val in the teardown case
|
return {'switches': {}, 'sims': {}}
|
||||||
return {'switches': dict(), 'sims': dict()}
|
else:
|
||||||
|
# get the status of the switch sims
|
||||||
# not teardown - just get the status of the switch sims
|
switchescompleteddict = {k: False for k in self.running_simulations()['switches']}
|
||||||
switchescompleteddict = {k: False for k in self.running_simulations()['switches']}
|
|
||||||
for switchsim in self.parent_node.switch_slots:
|
|
||||||
swname = switchsim.switch_builder.switch_binary_name()
|
|
||||||
if swname not in switchescompleteddict.keys():
|
|
||||||
switchescompleteddict[swname] = True
|
|
||||||
return {'switches': switchescompleteddict, 'sims': dict()}
|
|
||||||
|
|
||||||
if self.instance_assigned_simulations():
|
|
||||||
# this node has sims attached
|
|
||||||
|
|
||||||
# first, figure out which jobs belong to this instance.
|
|
||||||
# if they are all completed already. RETURN, DON'T TRY TO DO ANYTHING
|
|
||||||
# ON THE INSTANCE.
|
|
||||||
parentslots = self.parent_node.sim_slots
|
|
||||||
rootLogger.debug("parentslots " + str(parentslots))
|
|
||||||
jobnames = [slot.get_job_name() for slot in parentslots if slot is not None]
|
|
||||||
rootLogger.debug("jobnames " + str(jobnames))
|
|
||||||
already_done = all([job in completed_jobs for job in jobnames])
|
|
||||||
rootLogger.debug("already done? " + str(already_done))
|
|
||||||
if already_done:
|
|
||||||
# in this case, all of the node's jobs have already completed. do nothing.
|
|
||||||
# this can never happen in the cycle-accurate case at a point where we care
|
|
||||||
# about switch status, so don't bother to populate it
|
|
||||||
jobnames_to_completed = {jname: True for jname in jobnames}
|
|
||||||
return {'sims': jobnames_to_completed, 'switches': dict()}
|
|
||||||
|
|
||||||
# at this point, all jobs are NOT completed. so, see how they're doing now:
|
|
||||||
instance_screen_status = self.running_simulations()
|
|
||||||
switchescompleteddict = {k: False for k in instance_screen_status['switches']}
|
|
||||||
|
|
||||||
if self.instance_assigned_switches():
|
|
||||||
# fill in whether switches have terminated for some reason
|
|
||||||
for switchsim in self.parent_node.switch_slots:
|
for switchsim in self.parent_node.switch_slots:
|
||||||
swname = switchsim.switch_builder.switch_binary_name()
|
swname = switchsim.switch_builder.switch_binary_name()
|
||||||
if swname not in switchescompleteddict.keys():
|
if swname not in switchescompleteddict.keys():
|
||||||
switchescompleteddict[swname] = True
|
switchescompleteddict[swname] = True
|
||||||
|
|
||||||
slotsrunning = [x for x in instance_screen_status['simdrivers']]
|
return {'switches': switchescompleteddict, 'sims': {}}
|
||||||
|
|
||||||
rootLogger.debug("slots running")
|
if self.instance_assigned_simulations():
|
||||||
rootLogger.debug(slotsrunning)
|
# this node has sims attached
|
||||||
|
self.instance_logger(f"Polling node with simulations (and potentially switches)", debug=True)
|
||||||
|
|
||||||
|
|
||||||
|
sim_slots = self.parent_node.sim_slots
|
||||||
|
jobnames = [slot.get_job_name() for slot in sim_slots]
|
||||||
|
all_jobs_completed = all([(job in prior_completed_jobs) for job in jobnames])
|
||||||
|
|
||||||
|
self.instance_logger(f"jobnames: {jobnames}", debug=True)
|
||||||
|
self.instance_logger(f"All jobs completed?: {all_jobs_completed}", debug=True)
|
||||||
|
|
||||||
|
if all_jobs_completed:
|
||||||
|
do_terminate()
|
||||||
|
|
||||||
|
# in this case, all of the node's jobs have already completed. nothing to copy back.
|
||||||
|
# this can never happen in the cycle-accurate case at a point where we care
|
||||||
|
# about switch status, so don't bother to populate it
|
||||||
|
jobnames_to_completed = {jname: True for jname in jobnames}
|
||||||
|
return {'sims': jobnames_to_completed, 'switches': {}}
|
||||||
|
|
||||||
|
# at this point, all jobs are NOT completed. so, see how they're doing now:
|
||||||
|
instance_screen_status = self.running_simulations()
|
||||||
|
|
||||||
|
switchescompleteddict = {k: False for k in instance_screen_status['switches']}
|
||||||
|
slotsrunning = [x for x in instance_screen_status['simdrivers']]
|
||||||
|
self.instance_logger(f"Switch Slots running: {switchescompleteddict}", debug=True)
|
||||||
|
self.instance_logger(f"Sim Slots running: {slotsrunning}", debug=True)
|
||||||
|
|
||||||
|
if self.instance_assigned_switches():
|
||||||
|
# fill in whether switches have terminated
|
||||||
|
for switchsim in self.parent_node.switch_slots:
|
||||||
|
sw_name = switchsim.switch_builder.switch_binary_name()
|
||||||
|
if sw_name not in switchescompleteddict.keys():
|
||||||
|
switchescompleteddict[sw_name] = True
|
||||||
|
|
||||||
|
# fill in whether sims have terminated
|
||||||
|
completed_jobs = prior_completed_jobs.copy() # create local copy to append to
|
||||||
for slotno, jobname in enumerate(jobnames):
|
for slotno, jobname in enumerate(jobnames):
|
||||||
if str(slotno) not in slotsrunning and jobname not in completed_jobs:
|
if (str(slotno) not in slotsrunning) and (jobname not in completed_jobs):
|
||||||
self.instance_logger("Slot " + str(slotno) + " completed! copying results.")
|
self.instance_logger(f"Slot {slotno}, Job {jobname} completed!")
|
||||||
# NOW, we must copy off the results of this sim, since it just exited
|
|
||||||
parent = parentslots[slotno]
|
|
||||||
parent.copy_back_job_results_from_run(slotno, has_sudo())
|
|
||||||
# add our job to our copy of completed_jobs, so that next,
|
|
||||||
# we can test again to see if this instance is "done" and
|
|
||||||
# can be terminated
|
|
||||||
completed_jobs.append(jobname)
|
completed_jobs.append(jobname)
|
||||||
|
|
||||||
# determine if we're done now.
|
# this writes the job monitoring file
|
||||||
jobs_done_q = {job: job in completed_jobs for job in jobnames}
|
sim_slots[slotno].copy_back_job_results_from_run(slotno, has_sudo())
|
||||||
now_done = all(jobs_done_q.values())
|
|
||||||
rootLogger.debug("now done: " + str(now_done))
|
|
||||||
if now_done and self.instance_assigned_switches():
|
|
||||||
# we're done AND we have switches running here, so kill them,
|
|
||||||
# then copy off their logs. this handles the case where you
|
|
||||||
# have a node with one simulation and some switches, to make
|
|
||||||
# sure the switch logs are copied off.
|
|
||||||
#
|
|
||||||
# the other cases are when you have multiple sims and a cycle-acc network,
|
|
||||||
# in which case the all() will never actually happen (unless someone builds
|
|
||||||
# a workload where two sims exit at exactly the same time, which we should
|
|
||||||
# advise users not to do)
|
|
||||||
#
|
|
||||||
# a last use case is when there's no network, in which case
|
|
||||||
# instance_assigned_switches won't be true, so this won't be called
|
|
||||||
|
|
||||||
self.kill_switches_instance()
|
jobs_complete_dict = {job: job in completed_jobs for job in jobnames}
|
||||||
|
now_all_jobs_complete = all(jobs_complete_dict.values())
|
||||||
|
self.instance_logger(f"Now done?: {now_all_jobs_complete}", debug=True)
|
||||||
|
|
||||||
for counter, switchsim in enumerate(self.parent_node.switch_slots):
|
if now_all_jobs_complete:
|
||||||
switchsim.copy_back_switchlog_from_run(job_results_dir, counter)
|
if self.instance_assigned_switches():
|
||||||
|
# we have switches running here, so kill them,
|
||||||
|
# then copy off their logs. this handles the case where you
|
||||||
|
# have a node with one simulation and some switches, to make
|
||||||
|
# sure the switch logs are copied off.
|
||||||
|
#
|
||||||
|
# the other cases are when you have multiple sims and a cycle-acc network,
|
||||||
|
# in which case the all() will never actually happen (unless someone builds
|
||||||
|
# a workload where two sims exit at exactly the same time, which we should
|
||||||
|
# advise users not to do)
|
||||||
|
#
|
||||||
|
# a last use case is when there's no network, in which case
|
||||||
|
# instance_assigned_switches won't be true, so this won't be called
|
||||||
|
|
||||||
if now_done and terminateoncompletion:
|
self.kill_switches_instance()
|
||||||
# terminate the instance since everything is done and instance
|
|
||||||
# termination is enabled
|
|
||||||
self.terminate_instance()
|
|
||||||
|
|
||||||
return {'switches': switchescompleteddict, 'sims': jobs_done_q}
|
for counter, switch_slot in enumerate(self.parent_node.switch_slots):
|
||||||
|
switch_slot.copy_back_switchlog_from_run(job_results_dir, counter)
|
||||||
|
|
||||||
|
do_terminate()
|
||||||
|
|
||||||
|
return {'switches': switchescompleteddict, 'sims': jobs_complete_dict}
|
||||||
|
|
||||||
assert False
|
assert False
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def remote_kmsg(message: str) -> None:
|
def remote_kmsg(message: str) -> None:
|
||||||
""" This will let you write whatever is passed as message into the kernel
|
""" This will let you write whatever is passed as message into the kernel
|
||||||
log of the remote machine. Useful for figuring what the manager is doing
|
log of the remote machine. Useful for figuring what the manager is doing
|
||||||
|
@ -435,11 +439,9 @@ class EC2InstanceDeployManager(InstanceDeployManager):
|
||||||
|
|
||||||
This is in charge of managing the locations of stuff on remote nodes.
|
This is in charge of managing the locations of stuff on remote nodes.
|
||||||
"""
|
"""
|
||||||
boto3_instance_object: Optional[Union[EC2InstanceResource, MockBoto3Instance]]
|
|
||||||
|
|
||||||
def __init__(self, parent_node: Inst) -> None:
|
def __init__(self, parent_node: Inst) -> None:
|
||||||
super().__init__(parent_node)
|
super().__init__(parent_node)
|
||||||
self.boto3_instance_object = None
|
|
||||||
self.nbd_tracker = NBDTracker()
|
self.nbd_tracker = NBDTracker()
|
||||||
|
|
||||||
def get_and_install_aws_fpga_sdk(self) -> None:
|
def get_and_install_aws_fpga_sdk(self) -> None:
|
||||||
|
@ -618,10 +620,8 @@ class EC2InstanceDeployManager(InstanceDeployManager):
|
||||||
self.copy_switch_slot_infrastructure(slotno)
|
self.copy_switch_slot_infrastructure(slotno)
|
||||||
|
|
||||||
def terminate_instance(self) -> None:
|
def terminate_instance(self) -> None:
|
||||||
assert isinstance(self.boto3_instance_object, EC2InstanceResource)
|
self.instance_logger("Terminating instance", debug=True)
|
||||||
instanceids = get_instance_ids_for_instances([self.boto3_instance_object])
|
self.parent_node.terminate_self()
|
||||||
terminate_instances(instanceids, dryrun=False)
|
|
||||||
|
|
||||||
|
|
||||||
class VitisInstanceDeployManager(InstanceDeployManager):
|
class VitisInstanceDeployManager(InstanceDeployManager):
|
||||||
""" This class manages a Vitis-enabled instance """
|
""" This class manages a Vitis-enabled instance """
|
||||||
|
@ -665,5 +665,4 @@ class VitisInstanceDeployManager(InstanceDeployManager):
|
||||||
|
|
||||||
def terminate_instance(self) -> None:
|
def terminate_instance(self) -> None:
|
||||||
""" VitisInstanceDeployManager machines cannot be terminated. """
|
""" VitisInstanceDeployManager machines cannot be terminated. """
|
||||||
pass
|
return
|
||||||
|
|
||||||
|
|
|
@ -38,7 +38,7 @@ autocounter:
|
||||||
|
|
||||||
workload:
|
workload:
|
||||||
workload_name: linux-poweroff-uniform.json
|
workload_name: linux-poweroff-uniform.json
|
||||||
terminate_on_completion: no
|
terminate_on_completion: yes
|
||||||
suffix_tag: null
|
suffix_tag: null
|
||||||
|
|
||||||
host_debug:
|
host_debug:
|
||||||
|
|
|
@ -36,7 +36,7 @@ autocounter:
|
||||||
|
|
||||||
workload:
|
workload:
|
||||||
workload_name: linux-poweroff-uniform.json
|
workload_name: linux-poweroff-uniform.json
|
||||||
terminate_on_completion: no
|
terminate_on_completion: yes
|
||||||
suffix_tag: null
|
suffix_tag: null
|
||||||
|
|
||||||
host_debug:
|
host_debug:
|
||||||
|
|
Loading…
Reference in New Issue