multi-gpu training should also come after apex fp16 initialization (squad)
commit f0aeb7a814
parent adb3ef6368
@@ -101,6 +101,10 @@ def train(args, train_dataset, model, tokenizer):
             raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
         model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
 
+    # multi-gpu training (should be after apex fp16 initialization)
+    if args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
     # Distributed training (should be after apex fp16 initialization)
     if args.local_rank != -1:
         model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
@@ -457,8 +461,6 @@ def main():
         torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
 
     model.to(args.device)
-    if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
 
     logger.info("Training/evaluation parameters %s", args)
 
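To make the resulting order explicit, below is a minimal sketch of the wrapping sequence the diff establishes. It is not code from the commit: wrap_model is a hypothetical helper name, the args fields (fp16, fp16_opt_level, n_gpu, local_rank) mirror those used in run_squad.py, and the apex amp API is assumed to be installed.

import torch


def wrap_model(model, optimizer, args):
    # fp16 (apex) initialization first, so amp patches the bare model and optimizer
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)
    return model, optimizer

The ordering matters because apex's documentation recommends calling amp.initialize before wrapping the model in DistributedDataParallel; this commit applies the same rule to DataParallel so that both wrappers receive the amp-patched module.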