gradient norm clipping should be done right before calling the optimiser
parent bf2c36a920
commit 3775550c4b
@@ -157,13 +157,16 @@ def train(args, train_dataset, model, tokenizer):
             if args.fp16:
                 with amp.scale_loss(loss, optimizer) as scaled_loss:
                     scaled_loss.backward()
-                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
             else:
                 loss.backward()
-                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
 
             tr_loss += loss.item()
             if (step + 1) % args.gradient_accumulation_steps == 0:
+                if args.fp16:
+                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+                else:
+                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+
                 optimizer.step()
                 scheduler.step()  # Update learning rate schedule
                 model.zero_grad()
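Why the move matters: with gradient accumulation, param.grad only holds the complete gradient after the last backward() call of the accumulation window, so clipping inside the per-batch branch (as the old code did) measured and clipped a partial gradient. Below is a minimal standalone sketch of the corrected ordering, not code from this repository; the train signature, the dataloader shape, and the mse_loss stand-in for the task loss are illustrative assumptions.

import torch

def train(model, optimizer, scheduler, dataloader,
          max_grad_norm, gradient_accumulation_steps):
    model.zero_grad()
    for step, (inputs, targets) in enumerate(dataloader):
        # Generic stand-in for the task loss (assumption, not the repo's loss).
        loss = torch.nn.functional.mse_loss(model(inputs), targets)
        loss = loss / gradient_accumulation_steps   # average over the window
        loss.backward()                             # accumulates into param.grad
        if (step + 1) % gradient_accumulation_steps == 0:
            # Clip only here: param.grad now holds the gradient of the whole
            # accumulation window, so the norm is measured on the full update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()  # update learning rate schedule
            model.zero_grad()

The same reasoning applies to the fp16 branch in the diff: amp.master_params(optimizer) must likewise be clipped right before optimizer.step(), after all scaled backward passes of the window have run.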