diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index 8ac8eb88a0..852c97d746 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -147,6 +147,10 @@ class TrainingArguments:
         fp16_opt_level (:obj:`str`, `optional`, defaults to 'O1'):
             For :obj:`fp16` training, Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See
             details on the `Apex documentation <https://nvidia.github.io/apex/amp.html>`__.
+        fp16_backend (:obj:`str`, `optional`, defaults to :obj:`"auto"`):
+            The backend to use for mixed precision training. Must be one of :obj:`"auto"`, :obj:`"amp"` or
+            :obj:`"apex"`. :obj:`"auto"` will use AMP or APEX depending on the PyTorch version detected, while the
+            other choices will force the requested backend.
         local_rank (:obj:`int`, `optional`, defaults to -1):
             Rank of the process during distributed training.
         tpu_num_cores (:obj:`int`, `optional`):
@@ -213,10 +217,6 @@ class TrainingArguments:
             When resuming training, whether or not to skip the epochs and batches to get the data loading at the same
             stage as in the previous training. If set to :obj:`True`, the training will begin faster (as that skipping
             step can take a long time) but will not yield the same results as the interrupted training would have.
-        fp16_backend (:obj:`str`, `optional`, defaults to :obj:`"auto"`):
-            The backend to use for mixed precision training. Must be one of :obj:`"auto"`, :obj:`"amp"` or
-            :obj:`"apex"`. :obj:`"auto"` will use AMP or APEX depending on the PyTorch version detected, while the
-            other choices will force the requested backend.
         sharded_ddp (:obj:`bool`, `optional`, defaults to :obj:`False`):
             Use Sharded DDP training from `FairScale <https://github.com/facebookresearch/fairscale>`__ (in distributed
             training only). This is an experimental feature.
@@ -341,6 +341,10 @@ class TrainingArguments:
             )
         },
     )
+    fp16_backend: str = field(
+        default="auto",
+        metadata={"help": "The backend to be used for mixed precision.", "choices": ["auto", "amp", "apex"]},
+    )
     local_rank: int = field(default=-1, metadata={"help": "For distributed training: local_rank"})
     tpu_num_cores: Optional[int] = field(
@@ -398,10 +402,6 @@ class TrainingArguments:
             "help": "When resuming training, whether or not to skip the first epochs and batches to get to the same training data."
         },
     )
-    fp16_backend: str = field(
-        default="auto",
-        metadata={"help": "The backend to be used for mixed precision.", "choices": ["auto", "amp", "apex"]},
-    )
     sharded_ddp: bool = field(
         default=False,
         metadata={"help": "Whether or not to use sharded DDP training (in distributed training only)."},
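
Usage sketch (not part of the diff): the hunks only move the `fp16_backend` docstring entry and dataclass field so they sit next to the other fp16 options; the argument's behavior is unchanged. A minimal example, assuming a `transformers` version that already ships this field (`output_dir` and `fp16` are pre-existing `TrainingArguments` parameters, shown here only for context):

```python
from transformers import TrainingArguments

# Minimal sketch, assuming a transformers release that includes the
# fp16_backend field shown in this diff.
args = TrainingArguments(
    output_dir="output",   # where checkpoints and logs are written
    fp16=True,             # enable mixed precision training
    fp16_backend="auto",   # one of "auto", "amp", "apex"; "auto" picks AMP or
                           # Apex depending on the detected PyTorch version,
                           # while "amp"/"apex" force that backend
)
```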