From abd7110e21102467448035ffdbf6b208a05ac80b Mon Sep 17 00:00:00 2001
From: Pasquale Minervini
Date: Mon, 21 Oct 2019 19:56:52 +0100
Subject: [PATCH] gradient norm clipping should be done right before calling
 the optimiser - fixing run_glue and run_ner as well

---
 examples/run_glue.py | 7 +++++--
 examples/run_ner.py  | 7 +++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/examples/run_glue.py b/examples/run_glue.py
index 45924c9290..54f6689e4d 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -154,13 +154,16 @@ def train(args, train_dataset, model, tokenizer):
             if args.fp16:
                 with amp.scale_loss(loss, optimizer) as scaled_loss:
                     scaled_loss.backward()
-                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
             else:
                 loss.backward()
-                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
 
             tr_loss += loss.item()
             if (step + 1) % args.gradient_accumulation_steps == 0 and not args.tpu:
+                if args.fp16:
+                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+                else:
+                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+
                 optimizer.step()
                 scheduler.step()  # Update learning rate schedule
                 model.zero_grad()
diff --git a/examples/run_ner.py b/examples/run_ner.py
index fdf2f1924a..00eb039258 100644
--- a/examples/run_ner.py
+++ b/examples/run_ner.py
@@ -133,13 +133,16 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
             if args.fp16:
                 with amp.scale_loss(loss, optimizer) as scaled_loss:
                     scaled_loss.backward()
-                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
             else:
                 loss.backward()
-                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
 
             tr_loss += loss.item()
             if (step + 1) % args.gradient_accumulation_steps == 0:
+                if args.fp16:
+                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+                else:
+                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+
                 scheduler.step()  # Update learning rate schedule
                 optimizer.step()
                 model.zero_grad()
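
Note (not part of the patch above): with gradient accumulation, gradients from several backward passes are summed before a single optimizer step, so the norm should be measured and clipped once on the fully accumulated gradients, immediately before optimizer.step(); clipping after every backward() would only clip partial sums. The snippet below is a minimal, self-contained sketch of that pattern using a toy model and random data, not the examples/ scripts themselves; accumulation_steps and max_grad_norm stand in for args.gradient_accumulation_steps and args.max_grad_norm.

import torch

# Toy stand-ins for the real model, optimizer, and scheduler.
model = torch.nn.Linear(10, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10)

accumulation_steps = 4   # stands in for args.gradient_accumulation_steps
max_grad_norm = 1.0      # stands in for args.max_grad_norm

model.zero_grad()
for step in range(100):
    x, y = torch.randn(8, 10), torch.randn(8, 1)
    # Scale the loss so the accumulated gradient matches a full-batch average.
    loss = torch.nn.functional.mse_loss(model(x), y) / accumulation_steps
    loss.backward()  # gradients accumulate across micro-batches

    if (step + 1) % accumulation_steps == 0:
        # Clip once, on the accumulated gradients, right before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # update learning rate schedule
        model.zero_grad()

In the fp16 branches of the patch, clipping is applied to amp.master_params(optimizer) rather than model.parameters() because apex's amp keeps the FP32 master copies of the gradients there.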