#! /usr/bin/python
# -*- coding: utf-8 -*-
"""
tl train
========

(Alpha release - usage might change later)

The tensorlayer.cli.train module provides the ``tl train`` subcommand.
It helps the user bootstrap a TensorFlow/TensorLayer program for distributed training
using multiple GPU cards or CPUs on a single computer.

You need to first set `CUDA_VISIBLE_DEVICES <http://acceleware.com/blog/cudavisibledevices-masking-gpus>`_
to tell ``tl train`` which GPUs are available. If CUDA_VISIBLE_DEVICES is not set,
``tl train`` tries its best to discover all available GPUs.

In distributed training, each TensorFlow program needs a TF_CONFIG environment variable
that describes the cluster. It also needs a master daemon to monitor all trainers.
``tl train`` takes care of both tasks automatically.

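As a rough illustration, with the default single parameter server and two GPU trainers,
the TF_CONFIG handed to the first worker would look like the sketch below (ports start
at 10000; the exact host lists depend on ``-p`` and on how many trainers are launched):

.. code-block:: json

    {
        "cluster": {
            "ps": ["localhost:10000"],
            "worker": ["localhost:10001", "localhost:10002"]
        },
        "task": {"type": "worker", "index": 0}
    }
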
Usage
-----

tl train [-h] [-p NUM_PSS] [-c CPU_TRAINERS] <file> [args [args ...]]

.. code-block:: bash

    # example of using GPU 0 and 1 for training mnist
    CUDA_VISIBLE_DEVICES="0,1" tl train example/tutorial_mnist_distributed.py

    # example of using CPU trainers for inception v3
    tl train -c 16 example/tutorial_imagenet_inceptionV3_distributed.py

    # example of using GPU trainers for inception v3 with customized arguments
    # as CUDA_VISIBLE_DEVICES is not set, tl train tries to discover all available GPUs
    tl train example/tutorial_imagenet_inceptionV3_distributed.py -- --batch_size 16

Command-line Arguments
----------------------

- ``file``: Python file path.

- ``NUM_PSS``: The number of parameter servers.

- ``CPU_TRAINERS``: The number of CPU trainers.

  It is recommended that ``NUM_PSS + CPU_TRAINERS <= cpu count``.

- ``args``: Any parameter after ``--`` would be passed to the Python program.

Notes
-----
A parallel training program requires multiple parameter servers
to help parallel trainers exchange intermediate gradients.
The best number of parameter servers is often proportional to the
size of your model as well as the number of CPUs available.
You can control the number of parameter servers with the ``-p`` parameter.

If you have a single computer with many CPUs, you can use the ``-c`` parameter
to enable CPU-only parallel training.
We do not support GPU-CPU co-training because GPUs and CPUs run at different
speeds; mixing them in one training job would incur stragglers.
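
For example, to launch a CPU-only run with 2 parameter servers and 8 trainers on a
single machine (a sketch reusing the mnist example above):

.. code-block:: bash

    tl train -p 2 -c 8 example/tutorial_mnist_distributed.py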
"""

import argparse
import json
import multiprocessing
import os
import platform
import re
import subprocess
import sys

PORT_BASE = 10000
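# ``tl train`` assigns consecutive local ports starting at PORT_BASE: parameter servers
# first, then workers right after them (see the cluster_spec built in main()).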


def _get_gpu_ids():
    """Return the ids of usable GPUs, from CUDA_VISIBLE_DEVICES if set, otherwise by scanning /dev."""
    if 'CUDA_VISIBLE_DEVICES' in os.environ:
        return [int(x) for x in os.environ['CUDA_VISIBLE_DEVICES'].split(',') if x.strip()]
    if platform.system() in ['Darwin', 'Linux']:
        return [int(d.replace('nvidia', '')) for d in os.listdir('/dev') if re.match(r'^nvidia\d+$', d)]
    else:
        print('Please set CUDA_VISIBLE_DEVICES (see http://acceleware.com/blog/cudavisibledevices-masking-gpus)')
        return []


GPU_IDS = _get_gpu_ids()


def create_tf_config(cluster_spec, task_type, task_index):
    return {
        'cluster': cluster_spec,
        'task': {
            'type': task_type,
            'index': task_index
        },
    }


def create_tf_jobs(cluster_spec, prog, args):
    """Spawn one training subprocess per cluster task, pinning each worker to one of the visible GPUs."""
    gpu_assignment = dict((('worker', idx), gpu_idx) for (idx, gpu_idx) in enumerate(GPU_IDS))
    for job_type in cluster_spec:
        for task_index in range(len(cluster_spec[job_type])):
            new_env = os.environ.copy()
            new_env.update(
                {
                    'CUDA_VISIBLE_DEVICES': str(gpu_assignment.get((job_type, task_index), '')),
                    'TF_CONFIG': json.dumps(create_tf_config(cluster_spec, job_type, task_index)),
                }
            )
            yield subprocess.Popen(['python3', prog] + args, env=new_env)


def validate_arguments(args):
    if args.num_pss < 1:
        print('Value error: must have at least one parameter server.')
        sys.exit(1)

    if not GPU_IDS:
        num_cpus = multiprocessing.cpu_count()
        if args.cpu_trainers > num_cpus:
            print('Value error: there are %d available CPUs but you are requesting %d.' % (num_cpus, args.cpu_trainers))
            sys.exit(1)

    if not os.path.isfile(args.file):
        print('Value error: model training file does not exist')
        sys.exit(1)


def main(args):
    validate_arguments(args)
    num_workers = len(GPU_IDS) if GPU_IDS else args.cpu_trainers
    print('Using program %s with args %s' % (args.file, ' '.join(args.args)))
    print('Using %d workers, %d parameter servers, %d GPUs.' % (num_workers, args.num_pss, len(GPU_IDS)))
    cluster_spec = {
        'ps': ['localhost:%d' % (PORT_BASE + i) for i in range(args.num_pss)],
        'worker': ['localhost:%d' % (PORT_BASE + args.num_pss + i) for i in range(num_workers)]
    }
    processes = list(create_tf_jobs(cluster_spec, args.file, args.args))
    try:
        print('Press ENTER to exit the training ...')
        sys.stdin.readline()
    except KeyboardInterrupt:  # https://docs.python.org/3/library/exceptions.html#KeyboardInterrupt
        print('Keyboard interrupt received')
    finally:
        print('stopping all subprocesses ...')
        for p in processes:
            p.kill()
        for p in processes:
            p.wait()
        print('END')


def build_arg_parser(parser):
    parser.add_argument('-p', '--pss', dest='num_pss', type=int, default=1, help='number of parameter servers')
    parser.add_argument('-c', '--cpu_trainers', dest='cpu_trainers', type=int, default=1, help='number of CPU trainers')
    parser.add_argument('file', help='model training file path')
    parser.add_argument('args', nargs='*', type=str, help='arguments to <file>')


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    build_arg_parser(parser)
    args = parser.parse_args()
    main(args)