Trainer with grad accum (#6930)

* Add warning for gradient accumulation
* Formatting
This commit is contained in:
parent d4aa7284c8
commit 08de989a0a
@@ -60,6 +60,12 @@ class TrainingArguments:
             The batch size per GPU/TPU core/CPU for evaluation.
         gradient_accumulation_steps: (:obj:`int`, `optional`, defaults to 1):
             Number of update steps to accumulate the gradients for, before performing a backward/update pass.
+
+            .. warning::
+
+                When using gradient accumulation, one step is counted as one step with backward pass. Therefore,
+                logging, evaluation, save will be conducted every ``gradient_accumulation_steps * xxx_step`` training
+                examples.
         learning_rate (:obj:`float`, `optional`, defaults to 5e-5):
             The initial learning rate for Adam.
         weight_decay (:obj:`float`, `optional`, defaults to 0):
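For context on what this parameter controls: gradient accumulation sums gradients over several small batches and performs a single optimizer update afterward, simulating a larger effective batch size. A minimal PyTorch sketch of the mechanism (an illustration only, not the Trainer's actual loop; the toy model, optimizer, and data below are placeholders):

    import torch
    from torch import nn
    from torch.utils.data import DataLoader, TensorDataset

    # Toy setup; in a real script these come from your model and data pipeline.
    model = nn.Linear(10, 2)
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
    dataset = TensorDataset(torch.randn(64, 10), torch.randint(0, 2, (64,)))
    dataloader = DataLoader(dataset, batch_size=8)

    gradient_accumulation_steps = 4  # effective batch size: 8 * 4 = 32

    model.train()
    optimizer.zero_grad()
    for step, (inputs, labels) in enumerate(dataloader):
        loss = nn.functional.cross_entropy(model(inputs), labels)
        # Scale the loss so the summed gradients average over the effective batch.
        (loss / gradient_accumulation_steps).backward()
        if (step + 1) % gradient_accumulation_steps == 0:
            # One optimizer update per `gradient_accumulation_steps` backward passes.
            optimizer.step()
            optimizer.zero_grad()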
@@ -42,6 +42,12 @@ class TFTrainingArguments(TrainingArguments):
             The batch size per GPU/TPU core/CPU for evaluation.
         gradient_accumulation_steps: (:obj:`int`, `optional`, defaults to 1):
             Number of update steps to accumulate the gradients for, before performing a backward/update pass.
+
+            .. warning::
+
+                When using gradient accumulation, one step is counted as one step with backward pass. Therefore,
+                logging, evaluation, save will be conducted every ``gradient_accumulation_steps * xxx_step`` training
+                examples.
         learning_rate (:obj:`float`, `optional`, defaults to 5e-5):
             The initial learning rate for Adam.
         weight_decay (:obj:`float`, `optional`, defaults to 0):
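As a usage note, the cadence described in the new warning matters when combining this parameter with step-based logging or checkpointing. A hedged sketch of configuring it (argument names as in `TrainingArguments` around the time of this commit; `output_dir` is a placeholder):

    from transformers import TrainingArguments

    args = TrainingArguments(
        output_dir="output",            # placeholder path
        per_device_train_batch_size=4,
        gradient_accumulation_steps=8,  # one optimizer update per 8 micro-batches
        learning_rate=5e-5,
        logging_steps=500,
        save_steps=500,
    )

With ``gradient_accumulation_steps=8``, each logging/save trigger spans roughly eight times more training examples than the raw ``logging_steps``/``save_steps`` values alone suggest, which is the pitfall the warning calls out.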