From cf0755aa6e4e660563aa1a7b16533b55633213b1 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Wed, 21 Jul 2021 09:36:02 -0700
Subject: [PATCH] [debug] DebugUnderflowOverflow doesn't work with DP (#12816)

---
 docs/source/debugging.rst         | 6 +++++-
 src/transformers/trainer.py       | 9 ++++++++-
 src/transformers/trainer_utils.py | 4 ++--
 3 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/docs/source/debugging.rst b/docs/source/debugging.rst
index b13dc1a5e7..de445b2955 100644
--- a/docs/source/debugging.rst
+++ b/docs/source/debugging.rst
@@ -24,7 +24,11 @@ Underflow and Overflow Detection
 
 .. note::
 
-    This feature can be used with any ``nn.Module``-based model
+    For multi-GPU training it requires DDP (``torch.distributed.launch``).
+
+.. note::
+
+    This feature can be used with any ``nn.Module``-based model.
 
 If you start getting ``loss=NaN`` or the model inhibits some other abnormal behavior due to ``inf`` or ``nan`` in
 activations or weights one needs to discover where the first underflow or overflow happens and what led to it. Luckily
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 7a5c55207d..aab6a77888 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -1114,7 +1114,14 @@ class Trainer:
             num_train_samples = args.max_steps * total_train_batch_size
 
         if DebugOption.UNDERFLOW_OVERFLOW in self.args.debug:
-            debug_overflow = DebugUnderflowOverflow(self.model)  # noqa
+            if self.args.n_gpu > 1:
+                # nn.DataParallel(model) replicates the model, creating new variables and module
+                # references registered here no longer work on other gpus, breaking the module
+                raise ValueError(
+                    "Currently --debug underflow_overflow is not supported under DP. Please use DDP (torch.distributed.launch)."
+                )
+            else:
+                debug_overflow = DebugUnderflowOverflow(self.model)  # noqa
 
         delay_optimizer_creation = self.sharded_ddp is not None and self.sharded_ddp != ShardedDDPOption.SIMPLE
         if args.deepspeed:
diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py
index 926006e1b3..d26217de7a 100644
--- a/src/transformers/trainer_utils.py
+++ b/src/transformers/trainer_utils.py
@@ -420,7 +420,7 @@ class TrainerMemoryTracker:
         self.cur_stage = None
 
     def update_metrics(self, stage, metrics):
-        """stop tracking for the passed stage"""
+        """updates the metrics"""
        if self.skip_memory_metrics:
            return
 
@@ -442,7 +442,7 @@
                    metrics[f"{stage}_mem_gpu_{t}_delta"] = self.gpu[stage][t]
 
     def stop_and_update_metrics(self, metrics=None):
-        """combine stop + update in one call for simpler code"""
+        """combine stop and metrics update in one call for simpler code"""
         if self.skip_memory_metrics:
             return
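
Not part of the patch itself, but for context, here is a minimal sketch of what the
``DebugUnderflowOverflow(self.model)`` call guarded above does when it is allowed to run:
it registers forward hooks on every submodule of an ``nn.Module`` and aborts with a report
of the recorded frames once an ``inf``/``nan`` turns up. The toy model, the deliberately
overflowing weights and the ``try/except`` below are illustrative assumptions, not code
from the repository::

    import torch
    from torch import nn

    from transformers.debug_utils import DebugUnderflowOverflow

    # A toy model, purely illustrative.
    model = nn.Sequential(nn.Linear(4, 4), nn.ReLU(), nn.Linear(4, 4))

    # Same call the Trainer makes: registers a forward hook on every submodule.
    debug_overflow = DebugUnderflowOverflow(model)

    # Force an overflow: 1e30 * 1e10 is far beyond the float32 maximum (~3.4e38),
    # so the first Linear already produces `inf`.
    with torch.no_grad():
        model[0].weight.fill_(1e30)
        model[0].bias.zero_()

    x = torch.full((1, 4), 1e10)

    try:
        model(x)
    except Exception:
        # Before aborting, the detector prints the frames it recorded (module names
        # with the absolute min/max of their inputs, outputs and weights), which is
        # what pinpoints where the overflow first appeared.
        pass

Under ``nn.DataParallel`` the forward pass runs on per-GPU replicas of the model while the
hooks stay registered on the original module, so the detector never sees the replicas'
activations, which is why the patch rejects DP and points users at DDP
(``torch.distributed.launch``).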