From bc2571e61c985ec82819cf01ad038342771c94d0 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Mon, 26 Apr 2021 10:40:32 -0700 Subject: [PATCH] [Deepspeed] ZeRO-Infinity integration plus config revamp (#11418) * adding Z-inf * revamp config process * up version requirement * wip * massive rewrite * cleanup * cleanup * Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * consistent json commas * act on suggestions * leave this feature for 0.3.16 * style Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- docs/source/main_classes/trainer.rst | 772 ++++++++++++------ setup.py | 2 +- src/transformers/dependency_versions_table.py | 2 +- src/transformers/integrations.py | 364 +++++---- src/transformers/modeling_utils.py | 6 +- src/transformers/training_args.py | 11 +- tests/deepspeed/ds_config_zero2.json | 44 +- tests/deepspeed/ds_config_zero3.json | 57 +- tests/deepspeed/test_deepspeed.py | 126 ++- tests/test_trainer.py | 15 +- 10 files changed, 896 insertions(+), 503 deletions(-) diff --git a/docs/source/main_classes/trainer.rst b/docs/source/main_classes/trainer.rst index 106ef3c80e..cdc796c017 100644 --- a/docs/source/main_classes/trainer.rst +++ b/docs/source/main_classes/trainer.rst @@ -400,18 +400,18 @@ DeepSpeed `DeepSpeed `__ implements everything described in the `ZeRO paper `__. Currently it provides full support for: -1. Optimizer State Partitioning (ZeRO stage 1) -2. Gradient Partitioning (ZeRO stage 2) -3. Param Partitioning (ZeRO stage 3) +1. Optimizer state partitioning (ZeRO stage 1) +2. Gradient partitioning (ZeRO stage 2) +3. Parameter partitioning (ZeRO stage 3) 4. Custom mixed precision training handling -5. A range of fast CUDA-extension-based Optimizers -6. ZeRO-Offload +5. A range of fast CUDA-extension-based optimizers +6. ZeRO-Offload to CPU and NVMe ZeRO-Offload has its own dedicated paper: `ZeRO-Offload: Democratizing Billion-Scale Model Training -`__. +`__. And NVMe-support is described in the paper `ZeRO-Infinity: Breaking the GPU +Memory Wall for Extreme Scale Deep Learning `__. -DeepSpeed ZeRO-2 is currently used only for training, as all the currently available features are of no use to -inference. +DeepSpeed ZeRO-2 is primarily used only for training, as its features are of no use to inference. DeepSpeed ZeRO-3 can be used for inference as well, since it allows huge models to be loaded on multiple GPUs, which won't be possible on a single GPU. @@ -541,7 +541,7 @@ Here is an example of running ``run_translation.py`` under DeepSpeed deploying a .. code-block:: bash deepspeed examples/pytorch/translation/run_translation.py \ - --deepspeed tests/deepspeed/ds_config.json \ + --deepspeed tests/deepspeed/ds_config_zero3.json \ --model_name_or_path t5-small --per_device_train_batch_size 1 \ --output_dir output_dir --overwrite_output_dir --fp16 \ --do_train --max_train_samples 500 --num_train_epochs 1 \ @@ -566,17 +566,17 @@ To deploy DeepSpeed with one GPU adjust the :class:`~transformers.Trainer` comma .. 
code-block:: bash deepspeed --num_gpus=1 examples/pytorch/translation/run_translation.py \ - --deepspeed tests/deepspeed/ds_config.json \ + --deepspeed tests/deepspeed/ds_config_zero2.json \ --model_name_or_path t5-small --per_device_train_batch_size 1 \ --output_dir output_dir --overwrite_output_dir --fp16 \ --do_train --max_train_samples 500 --num_train_epochs 1 \ --dataset_name wmt16 --dataset_config "ro-en" \ --source_lang en --target_lang ro -This is almost the same as with multiple-GPUs, but here we tell DeepSpeed explicitly to use just one GPU. By default, -DeepSpeed deploys all GPUs it can see. If you have only 1 GPU to start with, then you don't need this argument. The -following `documentation `__ discusses the -launcher options. +This is almost the same as with multiple-GPUs, but here we tell DeepSpeed explicitly to use just one GPU via +``--num_gpus=1``. By default, DeepSpeed deploys all GPUs it can see on the given node. If you have only 1 GPU to start +with, then you don't need this argument. The following `documentation +`__ discusses the launcher options. Why would you want to use DeepSpeed with just one GPU? @@ -601,7 +601,7 @@ with DeepSpeed is to have at least the following configuration in the configurat "overlap_comm": true, "contiguous_gradients": true, "cpu_offload": true - }, + } } which enables ``cpu_offload`` and some other important features. You may experiment with the buffer sizes, you will @@ -610,6 +610,11 @@ find more details in the discussion below. For a practical usage example of this type of deployment, please, see this `post `__. +You may also try the ZeRO-3 with CPU and NVMe offload as explained further in this document. + + + Notes: - if you need to run on a specific GPU, which is different from GPU 0, you can't use ``CUDA_VISIBLE_DEVICES`` to limit @@ -643,7 +648,7 @@ If you're using only 1 GPU, here is how you'd have to adjust your training code os.environ['WORLD_SIZE'] = "1" # Now proceed as normal, plus pass the deepspeed config file - training_args = TrainingArguments(..., deepspeed="ds_config.json") + training_args = TrainingArguments(..., deepspeed="ds_config_zero3.json") trainer = Trainer(...) trainer.train() @@ -659,47 +664,62 @@ cell with: .. 
code-block:: python %%bash - cat <<'EOT' > ds_config.json + cat <<'EOT' > ds_config_zero3.json { "fp16": { - "enabled": true, + "enabled": "auto", "loss_scale": 0, "loss_scale_window": 1000, + "initial_scale_power": 16, "hysteresis": 2, "min_loss_scale": 1 }, - "zero_optimization": { - "stage": 2, - "allgather_partitions": true, - "allgather_bucket_size": 2e8, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 2e8, - "contiguous_gradients": true, - "cpu_offload": true - }, - "optimizer": { "type": "AdamW", "params": { - "lr": 3e-5, - "betas": [0.8, 0.999], - "eps": 1e-8, - "weight_decay": 3e-7 + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" } }, "scheduler": { "type": "WarmupLR", "params": { - "warmup_min_lr": 0, - "warmup_max_lr": 3e-5, - "warmup_num_steps": 500 + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" } }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e14, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_fp16_weights_on_model_save": true + }, + + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", "wall_clock_breakdown": false } EOT @@ -725,7 +745,7 @@ or with ``%%bash`` magic, where you can write a multi-line code for the shell pr In such case you don't need any of the code presented at the beginning of this section. -Note: ``%%bash`` magic is neat, but currently it buffers the output so you won't see the logs until the process +Note: While ``%%bash`` magic is neat, but currently it buffers the output so you won't see the logs until the process completes. @@ -760,48 +780,55 @@ When using DeepSpeed you always need to supply a DeepSpeed configuration file, y to be configured via the command line. You will find the nuances in the rest of this guide. To get an idea of what DeepSpeed configuration file looks like, here is one that activates ZeRO stage 2 features, -enables FP16, uses ``AdamW`` optimizer and ``WarmupLR`` scheduler: +including optimizer states cpu offload, uses ``AdamW`` optimizer and ``WarmupLR`` scheduler and will enable mixed +precision training if ``--fp16`` is passed: .. 
code-block:: json { "fp16": { - "enabled": true, + "enabled": "auto", "loss_scale": 0, "loss_scale_window": 1000, + "initial_scale_power": 16, "hysteresis": 2, "min_loss_scale": 1 }, - "zero_optimization": { - "stage": 2, - "allgather_partitions": true, - "allgather_bucket_size": 5e8, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 5e8, - "contiguous_gradients": true, - "cpu_offload": true - }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + }, - "optimizer": { - "type": "AdamW", - "params": { - "lr": 3e-5, - "betas": [ 0.8, 0.999 ], - "eps": 1e-8, - "weight_decay": 3e-7 - } - }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + }, - "scheduler": { - "type": "WarmupLR", - "params": { - "warmup_min_lr": 0, - "warmup_max_lr": 3e-5, - "warmup_num_steps": 500 - } - } + "zero_optimization": { + "stage": 2, + "allgather_partitions": true, + "allgather_bucket_size": 2e8, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 2e8, + "contiguous_gradients": true, + "cpu_offload": true + }, + + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", } When you execute the program, DeepSpeed will log the configuration it received from the :class:`~transformers.Trainer` @@ -835,35 +862,38 @@ or: Shared Configuration ======================================================================================================================= -Some configuration information is required by both the :class:`~transformers.Trainer` and DeepSpeed to function -correctly, therefore, to prevent conflicting definitions, which could lead to hard to detect errors, we chose to -configure those via the :class:`~transformers.Trainer` command line arguments. -Therefore, the following DeepSpeed configuration params shouldn't be used with the :class:`~transformers.Trainer`: +.. warning:: -* ``train_batch_size`` -* ``train_micro_batch_size_per_gpu`` -* ``gradient_accumulation_steps`` + This section is a must-read -as these will be automatically derived from the run time environment and the following 2 command line arguments: +Some configuration values are required by both the :class:`~transformers.Trainer` and DeepSpeed to function correctly, +therefore, to prevent conflicting definitions, which could lead to hard to detect errors, we chose to configure those +via the :class:`~transformers.Trainer` command line arguments. -.. code-block:: bash +Additionally, some configuration values are derived automatically based on the model's configuration, so instead of +remembering to manually adjust multiple values, it's the best to let the :class:`~transformers.Trainer` do the majority +of configuration for you. - --per_device_train_batch_size 8 --gradient_accumulation_steps 2 +Therefore, in the rest of this guide you will find a special configuration value: ``auto``, which when set will be +automatically replaced with the correct or most efficient value. Please feel free to choose to ignore this +recommendation and set the values explicitly, in which case be very careful that your the +:class:`~transformers.Trainer` arguments and DeepSpeed configurations agree. For example, are you using the same +learning rate, or batch size, or gradient accumulation settings? 
if these mismatch the training may fail in very +difficult to detect ways. You have been warned. -which are always required to be supplied. - -Of course, you will need to adjust the values in this example to your situation. +There are multiple other values that are specific to DeepSpeed-only and those you will have to set manually to suit +your needs. ZeRO ======================================================================================================================= -`Zero Redundancy Optimizer (ZeRO) `__ is the work horse of DeepSpeed. It +`Zero Redundancy Optimizer (ZeRO) `__ is the workhorse of DeepSpeed. It support 3 different levels (stages) of optimization. The first one is not quite interesting for scalability purposes, -therefore this document focuses on stages 2 and 3. You will find more indepth information in the DeepSpeed -documentation. +therefore this document focuses on stages 2 and 3. Stage 3 is further improved by the latest addition of ZeRO-Infinity. +You will find more indepth information in the DeepSpeed documentation. The ``zero_optimization`` section of the configuration file is the most important part (`docs `__), since that is where you define @@ -916,36 +946,43 @@ ZeRO-3 Config The following is an example configuration for ZeRO stage 3: - .. code-block:: json { "zero_optimization": { "stage": 3, - "cpu_offload": true, - "cpu_offload_params": true, - "cpu_offload_use_pin_memory" : true, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, "overlap_comm": true, "contiguous_gradients": true, "sub_group_size": 1e14, - "reduce_bucket_size": 1e6, - "stage3_prefetch_bucket_size": 0.94e6, - "stage3_param_persistence_threshold": 1e4, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", "stage3_max_live_parameters": 1e9, "stage3_max_reuse_distance": 1e9, "stage3_gather_fp16_weights_on_model_save": true } } -Note: if you're migrating from ZeRO-2 configuration that: ``allgather_partitions``, ``allgather_bucket_size`` and -``reduce_scatter`` configuration parameters are not used in ZeRO-3. If you keep these they will just be ignored. +If you are getting OOMs, because your model or activations don't fit into the GPU memory and you have unutilized CPU +memory offloading the optimizer states and parameters to CPU memory with ``"device": "cpu"`` may solve this limitation. +If you don't want to offload to CPU memory, use ``none`` instead of ``cpu`` for the ``device`` entry. Offloading to +NVMe is discussed further down. + +Pinned memory is enabled with ``pin_memory`` set to ``true``. This feature can improve the throughput at the cost of +making less memory available to other processes. Pinned memory is set aside to the specific process that requested it +and its typically accessed much faster than normal CPU memory. **Performance tuning:** - ``sub_group_size``: ``1e14`` -- ``reduce_bucket_size``: ``hidden_size*hidden_size`` -- ``stage3_prefetch_bucket_size``: ``0.9 * hidden_size * hidden_size`` -- ``stage3_param_persistence_threshold``: ``10 * hidden_size`` - ``stage3_max_live_parameters``: ``1e9`` - ``stage3_max_reuse_distance``: ``1e9`` @@ -960,37 +997,91 @@ going to be used again in near future (less than ``stage3_max_reuse_distance``) overhead. 
This is super helpful when you have activation checkpointing enabled, where we do a forward recompute and backward passes a a single layer granularity and want to keep the parameter in the forward recompute till the backward -If you set ``reduce_bucket_size``, ``stage3_prefetch_bucket_size`` and ``stage3_param_persistence_threshold`` as -recommended above, they will already be fairly small so you won't have to tune those much. +The following configuration values depend on the model's hidden size: -Since ``hidden_size`` varies from model to model, the ``Trainer`` will automatically set the needed value for the 3 -config parameters that contain that variable (using ``model.config.hidden_size``). Just set these values to ``0`` as -shown below and the right configuration will be passed to DeepSpeed: +- ``reduce_bucket_size``: ``hidden_size*hidden_size`` +- ``stage3_prefetch_bucket_size``: ``0.9 * hidden_size * hidden_size`` +- ``stage3_param_persistence_threshold``: ``10 * hidden_size`` + +therefore set these values to ``auto`` and the :class:`~transformers.Trainer` will automatically assign the recommended +values. But, of course, feel free to set these explicitly as well. + +``stage3_gather_fp16_weights_on_model_save`` enables model fp16 weights consolidation when model gets saved. With large +models and multiple GPUs this is an expensive operation both in terms of memory and speed. It's currently required if +you plan to resume the training. Watch out for future updates that will remove this limitation and make things more +flexible. + +If you're migrating from ZeRO-2 configuration note that ``allgather_partitions``, ``allgather_bucket_size`` and +``reduce_scatter`` configuration parameters are not used in ZeRO-3. If you keep these in the config file they will just +be ignored. Make sure to remove ``cpu_offload`` though, since it has been deprecated in ZeRO-3. + + + + +NVMe Support +======================================================================================================================= + +ZeRO-Infinity allows for training incredibly large models by extending GPU and CPU memory with NVMe memory. Thanks to +smart partitioning and tiling algorithms each GPU needs to send and receive very small amounts of data during +offloading so modern NVMe proved to be fit to allow for an even larger total memory pool available to your training +process. ZeRO-Infinity requires ZeRO-3 enabled. + +The following configuration example enables NVMe to offload both optimizer states and the params: .. 
code-block:: json { "zero_optimization": { "stage": 3, - "cpu_offload": true, - "cpu_offload_params": true, - "cpu_offload_use_pin_memory" : true, + "offload_optimizer": { + "device": "nvme", + "nvme_path": "/local_nvme", + "pin_memory": true, + "buffer_count": 4, + "fast_init": false + }, + "offload_param": { + "device": "nvme", + "nvme_path": "/local_nvme", + "pin_memory": true, + "buffer_count": 5, + "buffer_size": 1e8, + "max_in_cpu": 1e9 + } + "aio": { + "block_size": 262144, + "queue_depth": 32, + "thread_count": 1, + "single_submit": false, + "overlap_events": true + } "overlap_comm": true, "contiguous_gradients": true, "sub_group_size": 1e14, - "reduce_bucket_size": 0, - "stage3_prefetch_bucket_size": 0, - "stage3_param_persistence_threshold": 0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", "stage3_max_live_parameters": 1e9, "stage3_max_reuse_distance": 1e9, "stage3_gather_fp16_weights_on_model_save": true - } + }, } -``stage3_gather_fp16_weights_on_model_save`` enables model fp16 weights consolidation when model gets saved. With large -models and multiple GPUs this is an expensive operation both in terms of memory and speed. It's currently required if -you plan to resume the training. Watch out for future updates that will remove this limitation and make things more -flexible. +You can choose to offload both optimizer states and params to NVMe, or just one of them or none. For example, if you +have copious amounts of CPU memory available, by all means offload to CPU memory only as it'd be faster (hint: +`"device": "cpu"`). + +Here is the full documentation for offloading `optimizer states +`__ and `parameters +`__. + +Make sure that your ``nvme_path`` is actually an NVMe, since it will work with the normal hard drive or SSD, but it'll +be much much slower. The fast scalable training was designed with modern NVMe transfer speeds in mind (as of this +writing one can have ~3.5GB/s read, ~3GB/s write peak speeds). + +In order to figure out the optimal ``aio`` configuration block you must run a benchmark on your target setup, as +`explained here `__. + ZeRO-2 vs ZeRO-3 Performance @@ -1016,13 +1107,13 @@ these help you to trade scalability for speed depending on your needs. ZeRO-2 Example +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -Here is a full ZeRO-2 all-enabled configuration file ``ds_config_zero2.json``: +Here is a full ZeRO-2 auto-configuration file ``ds_config_zero2.json``: .. 
code-block:: json { "fp16": { - "enabled": true, + "enabled": "auto", "loss_scale": 0, "loss_scale_window": 1000, "initial_scale_power": 16, @@ -1030,6 +1121,25 @@ Here is a full ZeRO-2 all-enabled configuration file ``ds_config_zero2.json``: "min_loss_scale": 1 }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + }, + + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + }, + "zero_optimization": { "stage": 2, "allgather_partitions": true, @@ -1041,35 +1151,17 @@ Here is a full ZeRO-2 all-enabled configuration file ``ds_config_zero2.json``: "cpu_offload": true }, - "optimizer": { - "type": "AdamW", - "params": { - "lr": 3e-5, - "betas": [0.8, 0.999], - "eps": 1e-8, - "weight_decay": 3e-7 - } - }, - - "scheduler": { - "type": "WarmupLR", - "params": { - "warmup_min_lr": 0, - "warmup_max_lr": 3e-5, - "warmup_num_steps": 500 - } - }, - + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", "wall_clock_breakdown": false } - -ZeRO-3 Example -+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - -Here is a full ZeRO-3 all-enabled configuration file ``ds_config_zero3.json``: +Here is a full ZeRO-2 all-enabled manually set configuration file. It is here mainly for you to see what the typical +values look like, but we highly recommend using the one with multiple ``auto`` settings in it. .. code-block:: json @@ -1083,22 +1175,123 @@ Here is a full ZeRO-3 all-enabled configuration file ``ds_config_zero3.json``: "min_loss_scale": 1 }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": 3e-5, + "betas": [0.8, 0.999], + "eps": 1e-8, + "weight_decay": 3e-7 + } + }, + + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0, + "warmup_max_lr": 3e-5, + "warmup_num_steps": 500 + } + }, + + "zero_optimization": { + "stage": 2, + "allgather_partitions": true, + "allgather_bucket_size": 2e8, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 2e8, + "contiguous_gradients": true, + "cpu_offload": true + }, + + "steps_per_print": 2000, + "wall_clock_breakdown": false + } + + + +ZeRO-3 Example ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +Here is a full ZeRO-3 auto-configuration file ``ds_config_zero3.json``: + + +.. 
code-block:: json + + { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + }, + + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + }, + "zero_optimization": { "stage": 3, - "cpu_offload": true, - "cpu_offload_params": true, - "cpu_offload_use_pin_memory" : true, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, "overlap_comm": true, "contiguous_gradients": true, "sub_group_size": 1e14, - "reduce_bucket_size": 1e6, - "stage3_prefetch_bucket_size": 0.94e6, - "stage3_param_persistence_threshold": 1e4, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", "stage3_max_live_parameters": 1e9, "stage3_max_reuse_distance": 1e9, "stage3_gather_fp16_weights_on_model_save": true }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + } + +Here is a full ZeRO-3 all-enabled manually set configuration file. It is here mainly for you to see what the typical +values look like, but we highly recommend using the one with multiple ``auto`` settings in it. + +.. code-block:: json + + { + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "optimizer": { "type": "AdamW", "params": { @@ -1118,6 +1311,27 @@ Here is a full ZeRO-3 all-enabled configuration file ``ds_config_zero3.json``: } }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e14, + "reduce_bucket_size": 1e6, + "stage3_prefetch_bucket_size": 0.94e6, + "stage3_param_persistence_threshold": 1e4, + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_fp16_weights_on_model_save": true + }, + "steps_per_print": 2000, "wall_clock_breakdown": false } @@ -1153,7 +1367,35 @@ If you don't configure the ``optimizer`` entry in the configuration file, the :c automatically set it to ``AdamW`` and will use the supplied values or the defaults for the following command line arguments: ``--learning_rate``, ``--adam_beta1``, ``--adam_beta2``, ``--adam_epsilon`` and ``--weight_decay``. -Here is an example of the pre-configured ``optimizer`` entry for ``AdamW``: +Here is an example of the auto-configured ``optimizer`` entry for ``AdamW``: + +.. code-block:: json + + { + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + } + } + + +Note that the command line arguments will set the values in the configuration file. This is so that there is one +definitive source of the values and to avoid hard to find errors when for example, the learning rate is set to +different values in different places. Command line rules. 
The values that get overridden are: + +- ``lr`` with the value of ``--learning_rate`` +- ``betas`` with the value of ``--adam_beta1 --adam_beta2`` +- ``eps`` with the value of ``--adam_epsilon`` +- ``weight_decay`` with the value of ``--weight_decay`` + +Therefore please remember to tune the shared hyperparameters on the command line. + +You can also set the values explicitly: .. code-block:: json @@ -1166,33 +1408,29 @@ Here is an example of the pre-configured ``optimizer`` entry for ``AdamW``: "eps": 1e-8, "weight_decay": 3e-7 } - } + } } -Note that the command line arguments will override the values in the configuration file. This is so that there is one -definitive source of the values and to avoid hard to find errors when for example, the learning rate is set to -different values in different places. Command line rules. The values that get overridden are: +But then you're on your own synchronizing the :class:`~transformers.Trainer` command line arguments and the DeepSpeed +configuration. -- ``lr`` with the value of ``--learning_rate`` -- ``betas`` with the value of ``--adam_beta1 --adam_beta2`` -- ``eps`` with the value of ``--adam_epsilon`` -- ``weight_decay`` with the value of ``--weight_decay`` +If you want to use another optimizer which is not listed above, you will have to add to the top level configuration. -Therefore please remember to tune the shared hyperparameters on the command line. +.. code-block:: json -If you want to use another optimizer which is not listed above, you will have to add ``"zero_allow_untested_optimizer": -true`` to the top level configuration. + { + "zero_allow_untested_optimizer": true + } -If you want to use one of the officially supported optimizers, configure them explicitly in the configuration file, and -make sure to adjust the values. e.g. if use Adam you will want ``weight_decay`` around ``0.01``. +Similarly to ``AdamW``, you can configure other officially supported optimizers. Just remember that may have different +config values. e.g. for Adam you will want ``weight_decay`` around ``0.01``. Scheduler +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -DeepSpeed supports LRRangeTest, OneCycle, WarmupLR and WarmupDecayLR LR schedulers. The full documentation is `here -`__. - +DeepSpeed supports ``LRRangeTest``, ``OneCycle``, ``WarmupLR`` and ``WarmupDecayLR`` learning rate schedulers. The full +documentation is `here `__. Here is where the schedulers overlap between 🤗 Transformers and DeepSpeed: @@ -1200,12 +1438,37 @@ Here is where the schedulers overlap between 🤗 Transformers and DeepSpeed: * ``WarmupDecayLR`` via ``--lr_scheduler_type linear``. This is also the default value for ``--lr_scheduler_type``, therefore, if you don't configure the scheduler this is scheduler that will get configured by default. - If you don't configure the ``scheduler`` entry in the configuration file, the :class:`~transformers.Trainer` will use the values of ``--lr_scheduler_type``, ``--learning_rate`` and ``--warmup_steps`` to configure a 🤗 Transformers version of it. -Here is an example of the pre-configured ``scheduler`` entry for ``WarmupLR``: +Here is an example of the auto-configured ``scheduler`` entry for ``WarmupLR``: + +.. 
code-block:: json + + { + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + } + } + +Since `"auto"` is used the :class:`~transformers.Trainer` arguments will set the correct values in the configuration +file. This is so that there is one definitive source of the values and to avoid hard to find errors when, for example, +the learning rate is set to different values in different places. Command line rules. The values that get set are: + +- ``warmup_min_lr`` with the value of ``0`` +- ``warmup_max_lr`` with the value of ``--learning_rate`` +- ``warmup_num_steps`` with the value of ``--warmup_steps`` +- ``total_num_steps`` with either the value of ``--max_steps`` or if it is not provided, derived automatically at run + time based on the environment and the size of the dataset and other command line arguments (needed for + ``WarmupDecayLR``). + +You can, of course, take over any or all of the configuration values and set those yourself: .. code-block:: json @@ -1220,17 +1483,8 @@ Here is an example of the pre-configured ``scheduler`` entry for ``WarmupLR``: } } -Note that the command line arguments will override the values in the configuration file. This is so that there is one -definitive source of the values and to avoid hard to find errors when for example, the learning rate is set to -different values in different places. Command line rules. The values that get overridden are: - -- ``warmup_max_lr`` with the value of ``--learning_rate`` -- ``warmup_num_steps`` with the value of ``--warmup_steps`` -- ``total_num_steps`` with either the value of ``--max_steps`` or if it is not provided, derived automatically at run - time based on the environment and the size of the dataset and other command line arguments (needed for - ``WarmupDecayLR``). - -Therefore please remember to tune the shared hyperparameters on the command line. +But then you're on your own synchronizing the :class:`~transformers.Trainer` command line arguments and the DeepSpeed +configuration. For example, for ``WarmupDecayLR``, you can use the following entry: @@ -1240,16 +1494,16 @@ For example, for ``WarmupDecayLR``, you can use the following entry: "scheduler": { "type": "WarmupDecayLR", "params": { - "total_num_steps": 10, "last_batch_iteration": -1, - "warmup_min_lr": 0, - "warmup_max_lr": 0.001, - "warmup_num_steps": 1000 + "total_num_steps": "auto", + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" } } } -and ``warmup_max_lr``, ``warmup_num_steps`` and ``total_num_steps`` will be corrected at loading time. +and ``total_num_steps`, ``warmup_max_lr``, ``warmup_num_steps`` and ``total_num_steps`` will be set at loading time. @@ -1258,10 +1512,32 @@ Automatic Mixed Precision You can use automatic mixed precision with either a pytorch-like AMP way or the apex-like way: -If you want to use an equivalent of the Pytorch native amp, you can either configure the ``fp16`` entry in the -configuration file, or use the following command line arguments: ``--fp16 --fp16_backend amp``. +To configure pytorch AMP-like mode set: -Here is an example of the ``fp16`` configuration: +.. code-block:: json + + { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + } + } + +and the :class:`~transformers.Trainer` will automatically enable or disable it based on the value of +``args.fp16_backend``. 
The rest of config values are up to you. + +This mode gets enabled when ``--fp16 --fp16_backend amp`` command line args are passed. + +.. note:: + + At the moment DeepSpeed doesn't supported fp32 mode, though it will become available soon. Until then it will be + always set to ``true``. + +You can also enable/disable this mode explicitly: .. code-block:: json @@ -1270,17 +1546,32 @@ Here is an example of the ``fp16`` configuration: "enabled": true, "loss_scale": 0, "loss_scale_window": 1000, + "initial_scale_power": 16, "hysteresis": 2, "min_loss_scale": 1 - }, + } } +But then you're on your own synchronizing the :class:`~transformers.Trainer` command line arguments and the DeepSpeed +configuration. + Here is the `documentation `__. -If you want to use NVIDIA's apex instead, you can can either configure the ``amp`` entry in the configuration file, or -use the following command line arguments: ``--fp16 --fp16_backend apex --fp16_opt_level 01``. +To configure apex AMP-like mode set: -Here is an example of the ``amp`` configuration: +.. code-block:: json + + "amp": { + "enabled": "auto", + "opt_level": "auto" + } + +and the :class:`~transformers.Trainer` will automatically configure it based on the values of ``args.fp16_backend`` and +``args.fp16_opt_level``. + +This mode gets enabled when ``--fp16 --fp16_backend apex --fp16_opt_level 01`` command line args are passed. + +You can also configure this mode explicitly: .. code-block:: json @@ -1291,6 +1582,9 @@ Here is an example of the ``amp`` configuration: } } +But then you're on your own synchronizing the :class:`~transformers.Trainer` command line arguments and the DeepSpeed +configuration. + Here is the `documentation `__. @@ -1298,43 +1592,55 @@ Here is the `documentation Gradient Accumulation ======================================================================================================================= -While normally DeepSpeed gets gradient accumulation configured with: +To configure gradient accumulation set: .. code-block:: json { - "gradient_accumulation_steps": 3, + "gradient_accumulation_steps": "auto" } -in this case, to enable gradient accumulation, pass the command line ``--gradient_accumulation_steps 3`` argument as -normal and it will get injected into the DeepSpeed configuration. - -If you try to add it directly to the configuration file, you will receive an error from the ``Trainer`` - this is -because this setting is needed by the ``Trainer`` too, and so this approach ensures that there is a single way of -setting this value and thus avoid potential subtle errors. +and the :class:`~transformers.Trainer` will automatically set it to the value of ``args.gradient_accumulation_steps``. +You can also set the value explicitly: +.. code-block:: json + { + "gradient_accumulation_steps": 3 + } +But then you're on your own synchronizing the :class:`~transformers.Trainer` command line arguments and the DeepSpeed +configuration. Gradient Clipping ======================================================================================================================= -If you don't configure the ``gradient_clipping`` entry in the configuration file, the :class:`~transformers.Trainer` -will use the value of the ``--max_grad_norm`` command line argument to set it. - -Here is an example of the ``gradient_clipping`` configuration: +To configure gradient gradient clipping set: .. 
code-block:: json { - "gradient_clipping": 1.0, + "gradient_clipping": "auto" } +and the :class:`~transformers.Trainer` will automatically set it to the value of ``args.max_grad_norm``. + +You can also set the value explicitly: + +.. code-block:: json + + { + "gradient_clipping": 1.0 + } + +But then you're on your own synchronizing the :class:`~transformers.Trainer` command line arguments and the DeepSpeed +configuration. -Getting the model weights out + +Getting The Model Weights Out ======================================================================================================================= As long as you continue training and resuming using DeepSpeed you don't need to worry about anything. DeepSpeed stores @@ -1352,6 +1658,16 @@ version of the weights. If this setting is ``False`` ``pytorch_model.bin`` won't DeepSpeed's ``state_dict`` contains a placeholder and not the real weights. If we were to save this ``state_dict`` it won't be possible to load it back. + +.. code-block:: json + + { + "zero_optimization": { + "stage3_gather_fp16_weights_on_model_save": true + } + } + + **FP32 Weights:** While the fp16 weights are fine for resuming training, if you finished finetuning your model and want to upload it to @@ -1398,44 +1714,18 @@ This is it. ``pytorch_model.bin`` will now contain the full fp32 model weights c Note: currently the script requires 2x general RAM of the final fp32 model weights. -ZeRO 3 Nuances + +ZeRO-3 and Infinity Nuances ======================================================================================================================= -ZeRO 3 is quite different from ZeRO 2 because of its param sharding feature. +ZeRO-3 is quite different from ZeRO-2 because of its param sharding feature. + +ZeRO-Infinity further extends ZeRO-3 to support NVMe memory and multiple other speed and scalability improvements. While all the efforts were made for things to just work without needing any special changes to your models, in certain circumstances you may find the following information to be needed. -Registering External Parameters -+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - -If layer A needs to access weights belonging to layer B, currently layer A needs to tell DeepSpeed about it. This is -done with the help of ``deepspeed.zero.register_external_parameter`` that needs to be called in ``A.__init__`` and can -be seen in the following example: - -.. code-block:: python - - class ModuleZ3(torch.nn.Module): - def __init__(self, *args): - super().__init__(self, *args) - self.layer1 = SomeLayer() - self.layer2 = OtherLayer() - deepspeed.zero.register_external_parameter(self, self.layer1.weight) - - def forward(self, input): - x = self.layer1(input) - # self.layer1.weight is needed in ModuleZ3.forward - y = self.layer2(x, self.layer1.weight) - return y - -In general ``transformers`` models don't use this style of referring to other layer's weights so most likely you won't -need to use it. - -For full details on this method please refer to `Registering External Parameters -`__. - - Constructing Massive Models +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ @@ -1455,18 +1745,20 @@ context manager (which is also a function decorator), like so: As you can see this gives you a randomly initialized model. 
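A minimal sketch of this construction pattern (assuming a T5 model; the exact snippet lives in the unchanged portion of this document that the diff does not show):

.. code-block:: python

    from transformers import T5ForConditionalGeneration, T5Config
    import deepspeed

    # construct the model under zero.Init() so that its parameters are
    # partitioned across the participating GPUs at creation time, instead of
    # being materialized in full on CPU or on a single GPU first
    with deepspeed.zero.Init():
        config = T5Config.from_pretrained("t5-small")
        model = T5ForConditionalGeneration(config)
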
If you want to use a pretrained model, ``model_class.from_pretrained`` will activate this feature as long as -``is_deepspeed_zero3_enabled()`` returns ``True``, which can be set manually via ``deepspeed_zero3_enable(True)``. -Therefore to enable this feature here is the required sequence: +``is_deepspeed_zero3_enabled()`` returns ``True``, which currently is setup by the +class:`~transformers.TrainingArguments` object if the passed DeepSpeed configuration file contains ZeRO-3 config +section. Thus you must create the :class:`~transformers.TrainingArguments` object **before** calling +``from_pretrained``. Here is an example of a possible sequence: .. code-block:: python - from transformers.integrations import deepspeed_zero3_enable - deepspeed_zero3_enable(True) - model = T5ForConditionalGeneration.from_pretrained("t5-small") + from transformers import AutoModel, Trainer, TrainingArguments + training_args = TrainingArguments(..., deepspeed=ds_config) + model = AutoModel.from_pretrained("t5-small") + trainer = Trainer(model=model, args=training_args, ...) -If you're using ``Trainer`` command line arguments which include ``--deepspeed ds_config.json`` with ZeRO-3 config -enabled, then you can skip ``deepspeed_zero3_enable(True)`` as it will try to discover whether it'll be run under -ZeRO-3 and ``from_pretrained`` will automatically activate this feature. +If you're using the official example scripts and your command line arguments include ``--deepspeed ds_config.json`` +with ZeRO-3 config enabled, then everything is already done for you, since this is how example scripts are written. Note: If the fp16 weights of the model can't fit onto the memory of a single GPU this feature must be used. @@ -1475,8 +1767,6 @@ For full details on this method and other related features please refer to `Cons - - Gathering Parameters +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ @@ -1501,8 +1791,6 @@ larger multi-dimensional shape, this means that the parameter is partitioned and - - Notes ======================================================================================================================= @@ -1514,6 +1802,7 @@ Notes with your own trainer, and you will have to adapt the latter according to `the DeepSpeed integration instructions `__. 
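To illustrate the last note above, here is a rough, simplified sketch of a non-:class:`~transformers.Trainer`
integration. The model, dataloader and ``ds_config.json`` below are placeholders; refer to the DeepSpeed integration
instructions for the authoritative recipe:

.. code-block:: python

    import json
    import deepspeed

    # placeholders: any torch.nn.Module and any iterable of batches will do
    model = MyModel()
    dataloader = get_dataloader()

    with open("ds_config.json") as f:
        ds_config = json.load(f)

    # DeepSpeed wraps the model into an engine that owns the optimizer, the lr
    # scheduler, gradient accumulation and fp16 handling as configured in ds_config
    engine, optimizer, _, lr_scheduler = deepspeed.initialize(
        model=model,
        model_parameters=model.parameters(),
        config_params=ds_config,
    )

    for batch in dataloader:
        loss = engine(batch)
        engine.backward(loss)   # replaces loss.backward()
        engine.step()           # replaces optimizer.step()
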
+ Main DeepSpeed Resources ======================================================================================================================= @@ -1526,6 +1815,7 @@ Papers: - `ZeRO: Memory Optimizations Toward Training Trillion Parameter Models `__ - `ZeRO-Offload: Democratizing Billion-Scale Model Training `__ +- `ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning `__ Finally, please, remember that, HuggingFace :class:`~transformers.Trainer` only integrates DeepSpeed, therefore if you have any problems or questions with regards to DeepSpeed usage, please, file an issue with `DeepSpeed GitHub diff --git a/setup.py b/setup.py index 027484a619..b4d65585b8 100644 --- a/setup.py +++ b/setup.py @@ -90,7 +90,7 @@ _deps = [ "cookiecutter==1.7.2", "dataclasses", "datasets", - "deepspeed>=0.3.14", + "deepspeed>=0.3.15", "docutils==0.16.0", "fairscale>0.3", "faiss-cpu", diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index fe6c15e481..02c302755a 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -7,7 +7,7 @@ deps = { "cookiecutter": "cookiecutter==1.7.2", "dataclasses": "dataclasses", "datasets": "datasets", - "deepspeed": "deepspeed>=0.3.14", + "deepspeed": "deepspeed>=0.3.15", "docutils": "docutils==0.16.0", "fairscale": "fairscale>0.3", "faiss-cpu": "faiss-cpu", diff --git a/src/transformers/integrations.py b/src/transformers/integrations.py index 9ab198cf14..a2d6743a1e 100644 --- a/src/transformers/integrations.py +++ b/src/transformers/integrations.py @@ -19,8 +19,8 @@ import io import json import numbers import os -import sys import tempfile +import weakref from copy import deepcopy from pathlib import Path @@ -269,74 +269,180 @@ def rewrite_logs(d): return new_d -_is_deepspeed_zero3_enabled = None +def _is_true(config, key): + if config is None: + return False + return bool(config.get(key)) + + +def _set_if_auto(config, key, val): + if config is None: + return + if config.get(key) == "auto": + config[key] = val + + +class DeepSpeedConfigHF: + """ + This object contains Deepspeed configuration and can be quickly queried for things like zero stage. + + We store a ``weakref`` of this object in the module's global to be able to access the config from areas where the + Trainer is not available (e.g. `from_pretrained` and `_get_resized_embeddings`). + + The ``DeepSpeedConfigHF`` object is meant to be created during ``TrainingArguments`` object creation and has the + same lifespan as the latter. + """ + + def __init__(self, args): + self.config = None + self.stage = 0 + self.offload = False + + dep_version_check("deepspeed") + + self.config_process(args) + + # set global weakref object + deepspeed_config_hf_set(self) + + def is_zero2(self): + return self.stage == 2 + + def is_zero3(self): + return self.stage == 3 + + def is_offload(self): + return self.offload + + def config_process(self, args): + """ + 1. load json if the ``args.deepspeed`` is a path + 2. replace any ``auto`` values in the config with the correct or recommended value + + This is done as early as possible, before model is created, to allow ``is_deepspeed_zero3_enabled`` query and + getting to the early deepspeed config object during ``zero.Init()`` which needs whether fp16 is enabled, dtype, + etc. + + """ + config_file_or_dict = args.deepspeed + if isinstance(config_file_or_dict, dict): + # Don't modify user's data should they want to reuse it (e.g. 
in tests), because once we + # modified it, it will not be accepted here again, since `auto` values would have been overriden + config = deepcopy(config_file_or_dict) + elif isinstance(config_file_or_dict, str): + with io.open(config_file_or_dict, "r", encoding="utf-8") as f: + config = json.load(f) + else: + raise ValueError("expecting either a path to a config file or a pre-populated dict") + + self.config = config + + # DeepSpeed does: + # train_batch_size = world_size * train_micro_batch_size_per_gpu * gradient_accumulation_steps + train_batch_size = args.world_size * args.per_device_train_batch_size * args.gradient_accumulation_steps + _set_if_auto(config, "train_micro_batch_size_per_gpu", args.per_device_train_batch_size) + _set_if_auto(config, "gradient_accumulation_steps", args.gradient_accumulation_steps) + _set_if_auto(config, "train_batch_size", train_batch_size) + _set_if_auto(config, "gradient_clipping", args.max_grad_norm) + + # zero + config_zero = config.get("zero_optimization", {}) + self.stage = config_zero.get("stage", 0) + + config_optim = config.get("optimizer", {}) + if config_optim != {}: + config_optim_params = config_optim.get("params") + _set_if_auto(config_optim_params, "lr", args.learning_rate) + _set_if_auto(config_optim_params, "betas", [args.adam_beta1, args.adam_beta2]) + _set_if_auto(config_optim_params, "eps", args.adam_epsilon) + _set_if_auto(config_optim_params, "weight_decay", args.weight_decay) + + config_sched = config.get("scheduler", {}) + if config_sched != {}: + config_sched_params = config_sched.get("params") + _set_if_auto(config_sched_params, "warmup_min_lr", 0) + _set_if_auto(config_sched_params, "warmup_max_lr", args.learning_rate) + _set_if_auto(config_sched_params, "warmup_num_steps", args.warmup_steps) + # total_num_steps - will get set in deepspeed_init + + # fp16 + if args.fp16: + fp16_backend = "apex" if args.fp16_backend == "apex" else "amp" + else: + fp16_backend = None + + # amp: similar to the pytorch native amp - it has a bunch of optional params but we won't set + # any here unless the user did the work + config_fp16 = config.get("fp16") + # XXX: at the moment fp16 can't be False, but the fp32 solution is in works - once it's PR'ed and + # merged and a new release is made, delete the next line and uncomment the one after it + _set_if_auto(config_fp16, "enabled", True) + # _set_if_auto(config_fp16, "enabled", fp16_backend == "amp") + + # apex: delegates amp work to apex (which needs to be available), but it cannot be used with any + # ZeRO features, so probably best to be avoided. + config_amp = config.get("amp") + _set_if_auto(config_amp, "enabled", fp16_backend == "apex") + _set_if_auto(config_amp, "opt_level", args.fp16_opt_level) + + config_zero = config.get("zero_optimization", {}) + if self.is_zero2(): + self.offload = _is_true(config_zero, "cpu_offload") + elif self.is_zero3(): + offload_devices = ["cpu", "nvme"] + if config_zero.get("offload_optimizer", {}).get("device") in offload_devices: + self.offload = True + if config_zero.get("offload_param", {}).get("device") in offload_devices: + self.offload = True + + def config_finalize(self, args, model, num_training_steps): + """ + This stage is run after we have the model and know num_training_steps. + + Now we we can complete the configuration process. 
+ + """ + config = self.config + + # zero + config_zero = config.get("zero_optimization", {}) + if self.is_zero3(): + # automatically assign the optimal config values based on model config + hidden_size = model.config.hidden_size + _set_if_auto(config_zero, "reduce_bucket_size", hidden_size * hidden_size) + _set_if_auto(config_zero, "stage3_prefetch_bucket_size", 0.9 * hidden_size * hidden_size) + _set_if_auto(config_zero, "stage3_param_persistence_threshold", 10 * hidden_size) + + # scheduler + config_sched = config.get("scheduler", {}) + config_sched_params = config_sched.get("params", {}) + _set_if_auto(config_sched_params, "total_num_steps", num_training_steps) + + +# keep the config object global to be able to access it anywhere during TrainingArguments life-cycle +_deepspeed_config_hf_weak_ref = None + + +def deepspeed_config_hf_set(deepspeed_config_hf_obj): + # this is a special weakref global object to allow us to get to Deepspeed config from APIs + # that don't have an easy way to get to the Deepspeed config outside of the Trainer domain. + global _deepspeed_config_hf_weak_ref + # will go away automatically when DeepSpeedConfigHF is destroyed (when TrainingArguments is destroyed) + _deepspeed_config_hf_weak_ref = weakref.ref(deepspeed_config_hf_obj) def is_deepspeed_zero3_enabled(): - """ - This function answers to the question of whether DeepSpeed is going to be used and run using ZeRO Stage 3. - - It includes an auto-discovery method, see comments in the code for details. - - Returns: ``True`` if either it was explicitly enabled via ``deepspeed_zero3_enable(True)`` or the auto-detector was - able to derive that the ``Trainer`` will be running via DeepSpeed ZeRO stage 3. - """ - global _is_deepspeed_zero3_enabled - if _is_deepspeed_zero3_enabled is None: - _is_deepspeed_zero3_enabled = False - # Try to auto-discover if we are about to use DeepSpeed with ZeRO3 enabled. This will only - # work for scripts using cli to pass --deepspeed ds_config.json. If cmd args aren't used, - # then to get the model efficiently loaded across multiple-gpus one has to explicitly call - # is_deepspeed_zero3_enabled(True) **before** instantiating a model object - if "--deepspeed" in sys.argv: - idx = sys.argv.index("--deepspeed") - ds_config = sys.argv[idx + 1] - if not os.path.exists(ds_config): - raise ValueError("--deepspeed requires a valid path to a config file") - config = deepspeed_parse_config(ds_config) - if ( - "zero_optimization" in config - and "stage" in config["zero_optimization"] - and config["zero_optimization"]["stage"] == 3 - ): - _is_deepspeed_zero3_enabled = True - - return _is_deepspeed_zero3_enabled - - -def deepspeed_zero3_enable(enable=True): - """ - ``is_deepspeed_zero3_enabled()`` tries to derive automatically if DeepSpeed ZeRO 3 is going to be used by looking - at ``sys.argv`` which may or may contain information about where to find the DeepSpeed config if any. - - This function allows for explicit enabling/disabling of this global flag. - - Args: - enable: if set to ``True`` will make ``is_deepspeed_zero3_enabled()`` return ``True`` - """ - global _is_deepspeed_zero3_enabled - _is_deepspeed_zero3_enabled = enable - - -def deepspeed_parse_config(ds_config): - """ - If ``ds_config`` isn't already a dict, read it from the config file. - - If it's already a dict, return a copy of it, so that we can freely modify it. - """ - dep_version_check("deepspeed") - - if isinstance(ds_config, dict): - # Don't modify user's data should they want to reuse it (e.g. 
in tests), because once we - # modified it, it will not be accepted here again, since some config params must be not set by users - config = deepcopy(ds_config) - elif isinstance(ds_config, str): - with io.open(ds_config, "r", encoding="utf-8") as f: - config = json.load(f) + if _deepspeed_config_hf_weak_ref is not None and _deepspeed_config_hf_weak_ref() is not None: + return _deepspeed_config_hf_weak_ref().is_zero3() else: - raise ValueError("expecting either a path to a config file or a pre-populated dict") + return False - return config + +def deepspeed_config(): + if _deepspeed_config_hf_weak_ref is not None and _deepspeed_config_hf_weak_ref() is not None: + return _deepspeed_config_hf_weak_ref().config + else: + return None def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None): @@ -355,41 +461,16 @@ def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None): """ import deepspeed - args = trainer.args model = trainer.model - config = deepspeed_parse_config(args.deepspeed) + deepspeed_config_hf = trainer.args.deepspeed_config_hf + deepspeed_config_hf.config_finalize(trainer.args, model, num_training_steps) - # The following code translates relevant trainer's cl args into the DS config - - # First to ensure that there is no mismatch between cl args values and presets in the config - # file, ask to not set in ds config file: - # - "train_batch_size", - # - "train_micro_batch_size_per_gpu", - # - "gradient_accumulation_steps" - bs_keys = ["train_batch_size", "train_micro_batch_size_per_gpu"] - if len([x for x in bs_keys if x in config.keys()]): - raise ValueError( - f"Do not include {bs_keys} entries in the ds config file, as they will be set via --per_device_train_batch_size or its default" - ) - if "gradient_accumulation_steps" in config.keys(): - raise ValueError( - "Do not include gradient_accumulation_steps entries in the ds config file, as they will be set via --gradient_accumulation_steps or its default" - ) - - # DeepSpeed does: - # train_batch_size = n_gpus * train_micro_batch_size_per_gpu * gradient_accumulation_steps - # therefore we just need to set: - config["train_micro_batch_size_per_gpu"] = args.per_device_train_batch_size - config["gradient_accumulation_steps"] = args.gradient_accumulation_steps - - if "gradient_clipping" in config: - logger.info("Keeping the `gradient_clipping` config intact, ignoring any gradient clipping-specific cl args") - else: # override only if the ds config doesn't already have this section - config["gradient_clipping"] = args.max_grad_norm + # resume config update - some bits like `model` and `num_training_steps` only become available during train + config = deepspeed_config_hf.config # Optimizer + Scheduler - # Currently support combos: + # Currently supported combos: # 1. DS scheduler + DS optimizer: Yes # 2. HF scheduler + HF optimizer: Yes # 3. DS scheduler + HF optimizer: Yes @@ -402,36 +483,16 @@ def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None): # 4. 
HF scheduler + DS optimizer: No optimizer = None - if "optimizer" in config: - logger.info("Updating the `scheduler` config with other command line arguments") - - # to avoid inconsistent values of lr and warm up steps the command line args override config - params = dict( - lr=args.learning_rate, - betas=[args.adam_beta1, args.adam_beta2], - eps=args.adam_epsilon, - weight_decay=args.weight_decay, - ) - for k, v in params.items(): - if k in config["optimizer"]["params"]: - logger.info(f"setting optimizer.params.{k} to {v}") - config["optimizer"]["params"][k] = v - - else: # override only if the ds config doesn't already have this section - if ( - "zero_optimization" in config - and "cpu_offload" in config["zero_optimization"] - and config["zero_optimization"]["cpu_offload"] is True - ): + if "optimizer" not in config: + if deepspeed_config_hf.is_offload(): raise ValueError("ZeRO Offload can only work with DeepSpeed optimizers") - else: - # ds supports Adam, OneBitAdam, and Lamb optimizers and can import other optimizers from torch. - # But trainer uses AdamW by default. - # To use other optimizers so using a different scheduler requires voiding warranty with: `zero_allow_untested_optimizer` - trainer.create_optimizer() - optimizer = trainer.optimizer - # flag that this is non-native optimizer - config["zero_allow_untested_optimizer"] = True + + # ds supports Adam, OneBitAdam, and Lamb optimizers and can import other optimizers from torch. + # But trainer uses AdamW by default. + trainer.create_optimizer() + optimizer = trainer.optimizer + # To use other optimizers requires voiding warranty with: `zero_allow_untested_optimizer` + config["zero_allow_untested_optimizer"] = True # DS schedulers (deepspeed/runtime/lr_schedules.py): # @@ -442,25 +503,7 @@ def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None): # WarmupLR | constant_with_warmup | get_constant_schedule_with_warmup | w/ warmup_min_lr=0 # WarmupDecayLR| linear | get_linear_schedule_with_warmup | lr_scheduler = None - if "scheduler" in config: - logger.info("Updating the `scheduler` config with other command line arguments") - # the user won't easily know the correct num_training_steps should they use WarmupDecayLR, - # so let's set it to the correct value - if config["scheduler"]["type"] == "WarmupDecayLR": - logger.info(f"setting scheduler.params.total_num_steps to {num_training_steps}") - config["scheduler"]["params"]["total_num_steps"] = num_training_steps - - # to avoid inconsistent values of lr and warmup steps the command line args override config - params = dict( - warmup_max_lr=args.learning_rate, - warmup_num_steps=args.warmup_steps, - ) - for k, v in params.items(): - if k in config["scheduler"]["params"]: - logger.info(f"setting scheduler.params.{k} to {v}") - config["scheduler"]["params"][k] = v - - else: # override only if the ds config doesn't already have this section + if "scheduler" not in config: if "optimizer" in config: # to make this option work, we need to init DS optimizer first, then init HS scheduler, # then pass the HS scheduler to DS init, which is not possible at the moment @@ -469,43 +512,6 @@ def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None): trainer.create_scheduler(num_training_steps=num_training_steps) lr_scheduler = trainer.lr_scheduler - # fp16 - if trainer.fp16_backend is not None: - # Deepspeed has 2 possible fp16 config entries: - # - `fp16`: for the native amp - it has a bunch of optional params but we won't set any here unless the user did the 
work - # - `amp`: which delegates amp work to apex (which needs to be available), but it cannot be used with any ZeRO features, so probably best to be avoided. - if trainer.fp16_backend == "apex": - if "amp" in config: - logger.info("Keeping the `amp` config intact, ignoring any amp-specific cl args") - else: - config["amp"] = { - "enabled": True, - "opt_level": args.fp16_opt_level, - } - elif trainer.fp16_backend == "amp": - if "fp16" in config: - logger.info("Keeping the `fp16` config intact, ignoring any fp16-specific cl args") - else: - config["fp16"] = { - "enabled": True, - } - - # zero - if "zero_optimization" in config: - zero = config["zero_optimization"] - - # now we know for sure if zero3 is enabled - deepspeed_zero3_enable(zero.get("stage") == 3) - - # automatically assign the optimal config values based on model config - hidden_size = model.config.hidden_size - if zero.get("reduce_bucket_size") == 0: - zero["reduce_bucket_size"] = hidden_size * hidden_size - if zero.get("stage3_prefetch_bucket_size") == 0: - zero["stage3_prefetch_bucket_size"] = 0.9 * hidden_size * hidden_size - if zero.get("stage3_param_persistence_threshold") == 0: - zero["stage3_param_persistence_threshold"] = 10 * hidden_size - # keep for quick debug: # from pprint import pprint; pprint(config) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 103b9a906d..7b1f477af5 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1122,7 +1122,11 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix import deepspeed logger.info("Detected DeepSpeed ZeRO-3: activating zero.init() for this model") - # this immediately partitions the model to avoid the overhead in time and memory copying it on CPU or each GPU first + # this immediately partitions the model across all gpus, to avoid the overhead in time + # and memory copying it on CPU or each GPU first + + # XXX: param_dict will be added in deepspeed==0.3.16 and probably replaced by deepspeed_config + # with deepspeed.zero.Init(param_dict=deepspeed_config()): with deepspeed.zero.Init(): model = cls(config, *model_args, **model_kwargs) else: diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index a91921d466..30e433cfc7 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -70,9 +70,6 @@ class TrainingArguments: `__ arguments that can be specified on the command line. - - - Parameters: output_dir (:obj:`str`): The output directory where the model predictions and checkpoints will be written. @@ -625,6 +622,14 @@ class TrainingArguments: elif ShardedDDPOption.ZERO_DP_2 in self.sharded_ddp and ShardedDDPOption.ZERO_DP_3 in self.sharded_ddp: raise ValueError("`--sharded_ddp zero_dp_2` is not compatible with `--sharded_ddp zero_dp_3`.") + if self.deepspeed: + # - must be run very last in arg parsing, since it will use a lot of these settings. + # - must be run before the model is created. + from transformers.integrations import DeepSpeedConfigHF + + # will be used later by the Trainer (leave self.deepspeed unmodified in case a user relies on it not to be modified) + self.deepspeed_config_hf = DeepSpeedConfigHF(self) + def __repr__(self): # We override the default repr to remove deprecated arguments from the repr. This method should be removed once # those deprecated arguments are removed form TrainingArguments. 
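# Editor's illustration (not part of the patch): why the config object is created in
# `__post_init__`. With ZeRO-3 the DeepSpeed config must be known *before* the model is
# instantiated, so that `from_pretrained()` can wrap model creation in
# `deepspeed.zero.Init()` and partition it immediately. A sketch of the resulting
# user-side ordering (model name and config path are placeholders):
from transformers import AutoModelForSeq2SeqLM, TrainingArguments

training_args = TrainingArguments(
    output_dir="output_dir", fp16=True, deepspeed="ds_config_zero3.json"
)
# at this point the DeepSpeed config has been parsed; if it requests ZeRO-3, the model
# below is partitioned across the participating GPUs as it is being created
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")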
(TODO: v5) diff --git a/tests/deepspeed/ds_config_zero2.json b/tests/deepspeed/ds_config_zero2.json index a516f33125..ef180edd1e 100644 --- a/tests/deepspeed/ds_config_zero2.json +++ b/tests/deepspeed/ds_config_zero2.json @@ -1,6 +1,6 @@ { "fp16": { - "enabled": true, + "enabled": "auto", "loss_scale": 0, "loss_scale_window": 1000, "initial_scale_power": 16, @@ -8,6 +8,25 @@ "min_loss_scale": 1 }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + }, + + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + }, + "zero_optimization": { "stage": 2, "allgather_partitions": true, @@ -19,25 +38,10 @@ "cpu_offload": true }, - "optimizer": { - "type": "AdamW", - "params": { - "lr": 3e-5, - "betas": [0.8, 0.999], - "eps": 1e-8, - "weight_decay": 3e-7 - } - }, - - "scheduler": { - "type": "WarmupLR", - "params": { - "warmup_min_lr": 0, - "warmup_max_lr": 3e-5, - "warmup_num_steps": 500 - } - }, - + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", "wall_clock_breakdown": false } diff --git a/tests/deepspeed/ds_config_zero3.json b/tests/deepspeed/ds_config_zero3.json index 0f90995952..6f7a80e9e4 100644 --- a/tests/deepspeed/ds_config_zero3.json +++ b/tests/deepspeed/ds_config_zero3.json @@ -1,6 +1,6 @@ { "fp16": { - "enabled": true, + "enabled": "auto", "loss_scale": 0, "loss_scale_window": 1000, "initial_scale_power": 16, @@ -8,41 +8,50 @@ "min_loss_scale": 1 }, - "zero_optimization": { - "stage": 3, - "cpu_offload": true, - "cpu_offload_params": true, - "cpu_offload_use_pin_memory" : true, - "overlap_comm": true, - "contiguous_gradients": true, - "sub_group_size": 1e14, - "reduce_bucket_size": 0, - "stage3_prefetch_bucket_size": 0, - "stage3_param_persistence_threshold": 0, - "stage3_max_live_parameters": 1e9, - "stage3_max_reuse_distance": 1e9, - "stage3_gather_fp16_weights_on_model_save": true - }, - "optimizer": { "type": "AdamW", "params": { - "lr": 3e-5, - "betas": [0.8, 0.999], - "eps": 1e-8, - "weight_decay": 3e-7 + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" } }, "scheduler": { "type": "WarmupLR", "params": { - "warmup_min_lr": 0, - "warmup_max_lr": 3e-5, - "warmup_num_steps": 500 + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" } }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e14, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_fp16_weights_on_model_save": true + }, + + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", "wall_clock_breakdown": false } diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index 2b00e75652..52f9bd72f1 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -42,7 +42,7 @@ with ExtendSysPath(f"{bindir}/.."): from test_trainer import TrainerIntegrationCommon # noqa if 
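# Editor's illustration (not part of the patch): a rough sketch of how the `"auto"`
# placeholders in the json configs above get resolved from the Trainer command line
# arguments at runtime. The helper names are hypothetical; the real resolution lives in
# the `DeepSpeedConfigHF` object introduced by this PR.
def _fill_auto(config, dotted_key, value):
    node = config
    *parents, leaf = dotted_key.split(".")
    for key in parents:
        node = node.setdefault(key, {})
    if node.get(leaf) == "auto":
        node[leaf] = value

def resolve_auto_values(config, args):
    _fill_auto(config, "train_micro_batch_size_per_gpu", args.per_device_train_batch_size)
    _fill_auto(config, "gradient_accumulation_steps", args.gradient_accumulation_steps)
    _fill_auto(config, "gradient_clipping", args.max_grad_norm)
    _fill_auto(config, "optimizer.params.lr", args.learning_rate)
    _fill_auto(config, "optimizer.params.betas", [args.adam_beta1, args.adam_beta2])
    _fill_auto(config, "optimizer.params.eps", args.adam_epsilon)
    _fill_auto(config, "optimizer.params.weight_decay", args.weight_decay)
    _fill_auto(config, "scheduler.params.warmup_max_lr", args.learning_rate)
    _fill_auto(config, "scheduler.params.warmup_num_steps", args.warmup_steps)
    _fill_auto(config, "fp16.enabled", args.fp16)
    # "train_batch_size" is similarly derived from world_size * micro batch * grad accum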
is_torch_available(): - from test_trainer import get_regression_trainer # noqa + from test_trainer import RegressionModelConfig, RegressionPreTrainedModel, get_regression_trainer # noqa set_seed(42) @@ -66,6 +66,10 @@ def require_deepspeed(test_case): return test_case +if is_deepspeed_available(): + from deepspeed.utils import logger as deepspeed_logger # noqa + from transformers.integrations import deepspeed_config, is_deepspeed_zero3_enabled # noqa + ZERO2 = "zero2" ZERO3 = "zero3" stages = [ZERO2, ZERO3] @@ -115,12 +119,6 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon): with io.open(self.ds_config_file[ZERO3], "r", encoding="utf-8") as f: self.ds_config_dict[ZERO3] = json.load(f) - def tearDown(self): - # XXX: Fixme - this is a temporary band-aid since this global variable impacts other tests - import transformers - - transformers.integrations._is_deepspeed_zero3_enabled = None - def get_config_dict(self, stage): """As the tests modify the dict, always make a copy""" config = deepcopy(self.ds_config_dict[stage]) @@ -173,25 +171,65 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon): trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_zero2_dict) with self.assertRaises(Exception) as context: trainer.train() - self.assertTrue("HF scheduler + DeepSpeed optimizer combination is not possible" in str(context.exception)) + self.assertTrue( + "HF scheduler + DeepSpeed optimizer combination is not possible" in str(context.exception), + f"got exception: {context.exception}", + ) - def test_hf_optimizer_with_offload(self): - # must not allow non-DS optimizer when using ZERO-offload + def test_stage3_nvme_offload(self): with mockenv_context(**self.dist_env_1_gpu): - ds_config_zero2_dict = self.get_config_dict(ZERO2) - del ds_config_zero2_dict["optimizer"] # force default HF Trainer optimizer - ds_config_zero2_dict["zero_optimization"]["cpu_offload"] = True - # sanity check - should the default config change - assert ( - "cpu_offload" in ds_config_zero2_dict["zero_optimization"] - and ds_config_zero2_dict["zero_optimization"]["cpu_offload"] is True - ), "ensure the config is set up correctly" - trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_zero2_dict) - with self.assertRaises(Exception) as context: + # this actually doesn't have to be on NVMe, any storage will do since this test only + # runs a simple check that we can use some directory as if it were NVMe + nvme_path = self.get_auto_remove_tmp_dir() + nvme_config = dict(device="nvme", nvme_path=nvme_path) + ds_config_zero3_dict = self.get_config_dict(ZERO3) + ds_config_zero3_dict["zero_optimization"]["offload_optimizer"] = nvme_config + ds_config_zero3_dict["zero_optimization"]["offload_param"] = nvme_config + trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_zero3_dict) + with CaptureLogger(deepspeed_logger) as cs: trainer.train() - self.assertTrue("ZeRO Offload can only work with DeepSpeed optimizers" in str(context.exception)) + self.assertIn("DeepSpeed info", cs.out, "expected DeepSpeed logger output but got none") # --- These tests need to run on both zero stages --- # + + @parameterized.expand(stages) + def test_fp32(self, stage): + ds_config_dict = self.get_config_dict(stage) + ds_config_dict["fp16"]["enabled"] = False # force non-fp16 mode + + # XXX: do we go via from_pretrained in zero 3 here? 
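# Editor's illustration (not part of the patch): the shape of a user-facing ZeRO-3 NVMe
# offload section like the one the nvme test above injects. This is a sketch:
# "/local_nvme" is a placeholder path, `pin_memory` is optional, and the DeepSpeed
# ZeRO-Infinity docs list the remaining tunables (buffer sizes, counts, etc.).
zero3_nvme_offload = {
    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {"device": "nvme", "nvme_path": "/local_nvme", "pin_memory": True},
        "offload_param": {"device": "nvme", "nvme_path": "/local_nvme", "pin_memory": True},
    }
}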
need to test zero.Init(dtype=torch.float) + + # XXX: rewrite this test once fp32 is supported by DeepSpeed + with mockenv_context(**self.dist_env_1_gpu): + trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_dict) + with self.assertRaises(Exception) as context: + trainer.train() + self.assertIn( + "ZeRO is only supported if fp16 is enabled", + str(context.exception), + f"got exception: {context.exception}", + ) + + @parameterized.expand(stages) + def test_hf_optimizer_with_offload(self, stage): + # must not allow non-DS optimizer when using ZERO-offload + ds_config_dict = self.get_config_dict(stage) + del ds_config_dict["optimizer"] # force default HF Trainer optimizer + # force cpu offload + if stage == "stage2": + ds_config_dict["zero_optimization"]["cpu_offload"] = True + elif stage == "stage3": + ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu" + with mockenv_context(**self.dist_env_1_gpu): + trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_dict) + with self.assertRaises(Exception) as context: + trainer.train() + self.assertIn( + "ZeRO Offload can only work with DeepSpeed optimizers", + str(context.exception), + f"got exception: {context.exception}", + ) + @parameterized.expand(stages) def test_fake_notebook_no_launcher(self, stage): # this setup emulates a notebook where a launcher needs to be emulated by hand @@ -199,14 +237,12 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon): # note that unittest resets sys.stdout each test, so `CaptureStd` will work here to capture # DeepSpeed log if this test happens to run first in this pytest worker. But it will fail if # it's run not as a first test as `sys.stdout` will no longer be the same. So we either have - # to reset `logger.handlers[0].setStream(sys.stdout)` or directly capture from the logger. - from deepspeed.utils import logger - - with CaptureLogger(logger) as cs: - with mockenv_context(**self.dist_env_1_gpu): - trainer = get_regression_trainer(local_rank=0, deepspeed=self.ds_config_file[stage]) + # to reset `deepspeed_logger.handlers[0].setStream(sys.stdout)` or directly capture from the deepspeed_logger. + with mockenv_context(**self.dist_env_1_gpu): + trainer = get_regression_trainer(local_rank=0, deepspeed=self.ds_config_file[stage]) + with CaptureLogger(deepspeed_logger) as cs: trainer.train() - assert "DeepSpeed info" in cs.out, "expected DeepSpeed logger output but got none" + self.assertIn("DeepSpeed info", cs.out, "expected DeepSpeed logger output but got none") @parameterized.expand(stages) def test_early_get_last_lr(self, stage): @@ -425,6 +461,38 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon): self.assertEqual(b, b1) self.check_trainer_state_are_the_same(state, state1) + def test_config_object(self): + # test that we can switch from zero2 to zero3 in the same process for example + # test is_zero, etc. 
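# Editor's illustration (not part of the patch): the two module-level accessors this
# test exercises. The active config is held through a weakref to the most recently
# created DeepSpeed config object, so once the owning Trainer/TrainingArguments is
# garbage collected the accessors fall back to the "no DeepSpeed" defaults.
from transformers.integrations import deepspeed_config, is_deepspeed_zero3_enabled

if is_deepspeed_zero3_enabled():
    print("ZeRO-3 is active: from_pretrained() will instantiate the model under zero.Init()")
print(deepspeed_config())  # the live config dict, or None/empty when no config is active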
+ output_dir = self.get_auto_remove_tmp_dir() + kwargs = dict(output_dir=output_dir, train_len=8) + + with mockenv_context(**self.dist_env_1_gpu): + ds_config_zero3_dict = self.get_config_dict("zero3") + ds_config_zero2_dict = self.get_config_dict("zero2") + + trainer = get_regression_trainer(deepspeed=ds_config_zero3_dict, **kwargs) + self.assertTrue(is_deepspeed_zero3_enabled()) + + # test we can repeat that and with train this time + trainer = get_regression_trainer(deepspeed=ds_config_zero3_dict, **kwargs) + trainer.train() + self.assertTrue(is_deepspeed_zero3_enabled()) + + # test zero3 is disabled + trainer = get_regression_trainer(deepspeed=ds_config_zero2_dict, **kwargs) + self.assertFalse(is_deepspeed_zero3_enabled()) + + # check config obj + config = deepspeed_config() + self.assertTrue(bool(config), "Deepspeed config should be accessible") + + del trainer + # now weakref should gc the global and we shouldn't get anything here + config = deepspeed_config() + self.assertFalse(is_deepspeed_zero3_enabled()) + self.assertFalse(bool(config), "Deepspeed config should not be accessible") + @slow @require_deepspeed @@ -557,6 +625,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus): --adafactor --source_lang en --target_lang ro + --report_to none """.split() args.extend(["--source_prefix", '"translate English to Romanian: "']) @@ -626,6 +695,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus): --num_train_epochs 1 --warmup_steps 8 --block_size 128 + --report_to none """.split() ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage}.json".split() diff --git a/tests/test_trainer.py b/tests/test_trainer.py index 8ebdf92805..68a15ae673 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -213,16 +213,21 @@ if is_torch_available(): label_names = kwargs.get("label_names", None) train_dataset = RegressionDataset(length=train_len, label_names=label_names) eval_dataset = RegressionDataset(length=eval_len, label_names=label_names) - if pretrained: - config = RegressionModelConfig(a=a, b=b, double_output=double_output) - model = RegressionPreTrainedModel(config) + + model_init = kwargs.pop("model_init", None) + if model_init is not None: + model = None else: - model = RegressionModel(a=a, b=b, double_output=double_output) + if pretrained: + config = RegressionModelConfig(a=a, b=b, double_output=double_output) + model = RegressionPreTrainedModel(config) + else: + model = RegressionModel(a=a, b=b, double_output=double_output) + compute_metrics = kwargs.pop("compute_metrics", None) data_collator = kwargs.pop("data_collator", None) optimizers = kwargs.pop("optimizers", (None, None)) output_dir = kwargs.pop("output_dir", "./regression") - model_init = kwargs.pop("model_init", None) args = RegressionTrainingArguments(output_dir, a=a, b=b, **kwargs) return Trainer(