AdamW is now supported by default (#9624)
parent fa35cda91e
commit 4c32f9f26e
@@ -655,7 +655,6 @@ enables FP16, uses AdamW optimizer and WarmupLR scheduler:
             "weight_decay": 3e-7
         }
     },
-    "zero_allow_untested_optimizer": true,

     "scheduler": {
         "type": "WarmupLR",
@@ -766,8 +765,8 @@ Optimizer
 =======================================================================================================================


-DeepSpeed's main optimizers are Adam, OneBitAdam, and Lamb. These have been thoroughly tested with ZeRO and are thus
-recommended to be used. It, however, can import other optimizers from ``torch``. The full documentation is `here
+DeepSpeed's main optimizers are Adam, AdamW, OneBitAdam, and Lamb. These have been thoroughly tested with ZeRO and are
+thus recommended to be used. It can, however, import other optimizers from ``torch``. The full documentation is `here
 <https://www.deepspeed.ai/docs/config-json/#optimizer-parameters>`__.

 If you don't configure the ``optimizer`` entry in the configuration file, the :class:`~transformers.Trainer` will
@@ -779,7 +778,6 @@ Here is an example of the pre-configured ``optimizer`` entry for AdamW:
 .. code-block:: json

     {
-        "zero_allow_untested_optimizer": true,
         "optimizer": {
             "type": "AdamW",
             "params": {
@@ -791,8 +789,8 @@ Here is an example of the pre-configured ``optimizer`` entry for AdamW:
         }
     }

-Since AdamW isn't on the list of tested with DeepSpeed/ZeRO optimizers, we have to add
-``zero_allow_untested_optimizer`` flag.
+If you want to use another optimizer which is not listed above, you will have to add ``"zero_allow_untested_optimizer":
+true`` to the top level configuration.

 If you want to use one of the officially supported optimizers, configure them explicitly in the configuration file, and
 make sure to adjust the values, e.g. if you use Adam you will want ``weight_decay`` around ``0.01``.
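For a user following that updated instruction, here is a minimal sketch of patching a DeepSpeed config from Python. The ``ds_config.json`` file name, the ``Adagrad`` optimizer choice, and its params are assumptions for illustration, not part of this commit:

import json

# Hypothetical config file name; any DeepSpeed JSON config works the same way.
with open("ds_config.json") as f:
    config = json.load(f)

# Opt into an optimizer that is not on DeepSpeed's tested list, e.g. a plain
# torch optimizer; the top-level flag voids the ZeRO testing warranty.
config["zero_allow_untested_optimizer"] = True
config["optimizer"] = {
    "type": "Adagrad",
    "params": {"lr": 3e-5, "weight_decay": 0.01},
}

with open("ds_config.json", "w") as f:
    json.dump(config, f, indent=4)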
@@ -19,8 +19,6 @@
         "cpu_offload": true
     },

-    "zero_allow_untested_optimizer": true,
-
     "optimizer": {
         "type": "AdamW",
         "params": {
@@ -26,6 +26,7 @@ from types import SimpleNamespace

 from .trainer_utils import SchedulerType
 from .utils import logging
+from .utils.versions import require_version


 logger = logging.get_logger(__name__)
@@ -281,6 +282,8 @@ def init_deepspeed(trainer, num_training_steps):
     """
     import deepspeed

+    require_version("deepspeed>0.3.10")
+
     args = trainer.args
     ds_config_file = args.deepspeed
     model = trainer.model
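The new ``require_version("deepspeed>0.3.10")`` call gates the integration on a minimum DeepSpeed release. For readers unfamiliar with the helper, here is a minimal sketch of what such a check can do, assuming ``importlib.metadata`` and ``packaging`` are available. This is an illustrative stand-in, not the actual ``transformers.utils.versions`` implementation:

import importlib.metadata

from packaging import version


def require_version_sketch(requirement: str):
    # Parse a requirement string such as "deepspeed>0.3.10" into
    # (package, operator, wanted version). ">=" must be tried before ">".
    for op in (">=", ">", "=="):
        if op in requirement:
            pkg, wanted = requirement.split(op)
            break
    else:
        raise ValueError(f"cannot parse requirement: {requirement!r}")
    got = version.parse(importlib.metadata.version(pkg))
    wanted_version = version.parse(wanted)
    satisfied = {">=": got >= wanted_version, ">": got > wanted_version, "==": got == wanted_version}[op]
    if not satisfied:
        raise ImportError(f"this feature requires {requirement}, but found {pkg}=={got}")


# Usage mirroring the new call in the diff (raises if deepspeed is absent or too old):
require_version_sketch("deepspeed>0.3.10")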
@@ -323,9 +326,8 @@ def init_deepspeed(trainer, num_training_steps):
                 f"Keeping the `optimizer` config from {ds_config_file} intact, ignoring any optimizer-specific cl args"
             )
         else:  # override only if the ds config doesn't already have this section
-            # ds supports Adam, OneBitAdam, and Lamb optimizers and can import other optimizers from torch.
-            # But trainer uses AdamW by default.
-            # To use other optimizers so using a different scheduler requires voiding warranty with: `zero_allow_untested_optimizer`
+            # ds supports Adam, AdamW, OneBitAdam, and Lamb optimizers and can import other optimizers from torch.
+            # To use other optimizers requires voiding warranty with: `"zero_allow_untested_optimizer": true`

             optimizer_configs = {
                 "AdamW": {
@@ -337,7 +339,6 @@ def init_deepspeed(trainer, num_training_steps):
             }
             optimizer = "AdamW"

-            config["zero_allow_untested_optimizer"] = True
             config["optimizer"] = {
                 "type": optimizer,
                 "params": optimizer_configs[optimizer],
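Taken together, the code-side hunks mean the else branch now injects Trainer's AdamW defaults without the warranty-voiding flag, since AdamW is natively supported. Here is a condensed sketch of the resulting flow, with the ``args`` attribute names assumed from Trainer's command-line arguments rather than shown in this diff:

def configure_optimizer_sketch(config: dict, args) -> dict:
    # Condensed view of the post-commit behavior: inject Trainer's AdamW
    # defaults only when the user's DeepSpeed config has no `optimizer` entry.
    if "optimizer" not in config:
        optimizer_configs = {
            "AdamW": {
                # Attribute names assumed from Trainer's cl args, not shown in this diff.
                "lr": args.learning_rate,
                "betas": [args.adam_beta1, args.adam_beta2],
                "eps": args.adam_epsilon,
                "weight_decay": args.weight_decay,
            }
        }
        optimizer = "AdamW"
        # AdamW is now on DeepSpeed's supported list, so
        # config["zero_allow_untested_optimizer"] is no longer set here.
        config["optimizer"] = {
            "type": optimizer,
            "params": optimizer_configs[optimizer],
        }
    return config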