Prepare optimizer only when args.do_train is True

MottoX 2019-05-02 19:09:29 +08:00
parent 3ae8c8be1e
commit 74dbba64bc
5 changed files with 130 additions and 125 deletions
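
Summary of the change: each of the five example scripts previously built its optimizer unconditionally (parameter groups, BertAdam/FusedAdam/OpenAIAdam, and the fp16 wrappers); after this commit that whole block runs only when --do_train is passed, so evaluation-only runs skip it. Below is a minimal, self-contained sketch of the same pattern, assuming a toy model and plain torch.optim.Adam rather than the scripts' actual optimizers:

import argparse
import torch

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--do_train', action='store_true')
    parser.add_argument('--do_eval', action='store_true')
    parser.add_argument('--learning_rate', type=float, default=5e-5)
    args = parser.parse_args()

    model = torch.nn.Linear(8, 2)  # stand-in for the real model

    # Prepare optimizer only when training is requested; an eval-only run
    # (--do_eval without --do_train) never builds optimizer state.
    optimizer = None
    if args.do_train:
        optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    if args.do_train:
        pass  # the training loop would step `optimizer` here
    if args.do_eval:
        pass  # evaluation only runs the forward pass

if __name__ == '__main__':
    main()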

View File

@@ -534,36 +534,37 @@ def main():
         model = torch.nn.DataParallel(model)

     # Prepare optimizer
-    param_optimizer = list(model.named_parameters())
-    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-    optimizer_grouped_parameters = [
-        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
-    if args.fp16:
-        try:
-            from apex.optimizers import FP16_Optimizer
-            from apex.optimizers import FusedAdam
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-
-        optimizer = FusedAdam(optimizer_grouped_parameters,
-                              lr=args.learning_rate,
-                              bias_correction=False,
-                              max_grad_norm=1.0)
-        if args.loss_scale == 0:
-            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-        else:
-            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
-        warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
-                                             t_total=num_train_optimization_steps)
-    else:
-        optimizer = BertAdam(optimizer_grouped_parameters,
-                             lr=args.learning_rate,
-                             warmup=args.warmup_proportion,
-                             t_total=num_train_optimization_steps)
+    if args.do_train:
+        param_optimizer = list(model.named_parameters())
+        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+        optimizer_grouped_parameters = [
+            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+            ]
+        if args.fp16:
+            try:
+                from apex.optimizers import FP16_Optimizer
+                from apex.optimizers import FusedAdam
+            except ImportError:
+                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
+
+            optimizer = FusedAdam(optimizer_grouped_parameters,
+                                  lr=args.learning_rate,
+                                  bias_correction=False,
+                                  max_grad_norm=1.0)
+            if args.loss_scale == 0:
+                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
+            else:
+                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
+            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
+                                                 t_total=num_train_optimization_steps)
+        else:
+            optimizer = BertAdam(optimizer_grouped_parameters,
+                                 lr=args.learning_rate,
+                                 warmup=args.warmup_proportion,
+                                 t_total=num_train_optimization_steps)

     global_step = 0
     if args.do_train:
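
The parameter grouping in the hunk above is unchanged by the commit (only re-indented): parameters whose names match 'bias', 'LayerNorm.bias', or 'LayerNorm.weight' get no weight decay. A standalone sketch of that grouping, with torch.optim.AdamW standing in for BertAdam/FusedAdam and a toy module whose attribute name mirrors the BERT convention:

import torch
from torch import nn

class TinyBlock(nn.Module):
    # The attribute is named `LayerNorm` so parameter names contain
    # "LayerNorm.weight"/"LayerNorm.bias", as they do in the BERT modules.
    def __init__(self):
        super().__init__()
        self.dense = nn.Linear(16, 16)
        self.LayerNorm = nn.LayerNorm(16)

model = TinyBlock()
param_optimizer = list(model.named_parameters())

# Biases and LayerNorm parameters are excluded from weight decay.
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]

# AdamW is used here only so the sketch runs without the library's optimizers;
# the grouping is the point.
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=5e-5)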

View File

@@ -763,35 +763,36 @@ def main():
         model = torch.nn.DataParallel(model)

     # Prepare optimizer
-    param_optimizer = list(model.named_parameters())
-    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-    optimizer_grouped_parameters = [
-        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
-    if args.fp16:
-        try:
-            from apex.optimizers import FP16_Optimizer
-            from apex.optimizers import FusedAdam
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-
-        optimizer = FusedAdam(optimizer_grouped_parameters,
-                              lr=args.learning_rate,
-                              bias_correction=False,
-                              max_grad_norm=1.0)
-        if args.loss_scale == 0:
-            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-        else:
-            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
-        warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
-                                             t_total=num_train_optimization_steps)
-    else:
-        optimizer = BertAdam(optimizer_grouped_parameters,
-                             lr=args.learning_rate,
-                             warmup=args.warmup_proportion,
-                             t_total=num_train_optimization_steps)
+    if args.do_train:
+        param_optimizer = list(model.named_parameters())
+        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+        optimizer_grouped_parameters = [
+            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+            ]
+        if args.fp16:
+            try:
+                from apex.optimizers import FP16_Optimizer
+                from apex.optimizers import FusedAdam
+            except ImportError:
+                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
+
+            optimizer = FusedAdam(optimizer_grouped_parameters,
+                                  lr=args.learning_rate,
+                                  bias_correction=False,
+                                  max_grad_norm=1.0)
+            if args.loss_scale == 0:
+                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
+            else:
+                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
+            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
+                                                 t_total=num_train_optimization_steps)
+        else:
+            optimizer = BertAdam(optimizer_grouped_parameters,
+                                 lr=args.learning_rate,
+                                 warmup=args.warmup_proportion,
+                                 t_total=num_train_optimization_steps)

     global_step = 0
     nb_tr_steps = 0

View File

@@ -183,19 +183,20 @@ def main():
     eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

     # Prepare optimizer
-    param_optimizer = list(model.named_parameters())
-    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-    optimizer_grouped_parameters = [
-        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
-    num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size
-    optimizer = OpenAIAdam(optimizer_grouped_parameters,
-                           lr=args.learning_rate,
-                           warmup=args.warmup_proportion,
-                           max_grad_norm=args.max_grad_norm,
-                           weight_decay=args.weight_decay,
-                           t_total=num_train_optimization_steps)
+    if args.do_train:
+        param_optimizer = list(model.named_parameters())
+        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+        optimizer_grouped_parameters = [
+            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+            ]
+        num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size
+        optimizer = OpenAIAdam(optimizer_grouped_parameters,
+                               lr=args.learning_rate,
+                               warmup=args.warmup_proportion,
+                               max_grad_norm=args.max_grad_norm,
+                               weight_decay=args.weight_decay,
+                               t_total=num_train_optimization_steps)

     if args.do_train:
         nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
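
In the hunk above, num_train_optimization_steps is derived from the dataset size right before the optimizer is built, and that computation now also happens only under --do_train. A quick worked instance of the integer arithmetic, with made-up numbers:

# Illustrative values only, not from the repository
len_train_data = 10000        # len(train_data)
num_train_epochs = 3          # args.num_train_epochs
train_batch_size = 32         # args.train_batch_size

num_train_optimization_steps = len_train_data * num_train_epochs // train_batch_size
print(num_train_optimization_steps)  # 30000 // 32 == 937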

View File

@@ -922,40 +922,41 @@ def main():
         model = torch.nn.DataParallel(model)

     # Prepare optimizer
-    param_optimizer = list(model.named_parameters())
-
-    # hack to remove pooler, which is not used
-    # thus it produce None grad that break apex
-    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
-
-    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-    optimizer_grouped_parameters = [
-        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
-    if args.fp16:
-        try:
-            from apex.optimizers import FP16_Optimizer
-            from apex.optimizers import FusedAdam
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-
-        optimizer = FusedAdam(optimizer_grouped_parameters,
-                              lr=args.learning_rate,
-                              bias_correction=False,
-                              max_grad_norm=1.0)
-        if args.loss_scale == 0:
-            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-        else:
-            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
-        warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
-                                             t_total=num_train_optimization_steps)
-    else:
-        optimizer = BertAdam(optimizer_grouped_parameters,
-                             lr=args.learning_rate,
-                             warmup=args.warmup_proportion,
-                             t_total=num_train_optimization_steps)
+    if args.do_train:
+        param_optimizer = list(model.named_parameters())
+
+        # hack to remove pooler, which is not used
+        # thus it produce None grad that break apex
+        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
+
+        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+        optimizer_grouped_parameters = [
+            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+            ]
+        if args.fp16:
+            try:
+                from apex.optimizers import FP16_Optimizer
+                from apex.optimizers import FusedAdam
+            except ImportError:
+                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
+
+            optimizer = FusedAdam(optimizer_grouped_parameters,
+                                  lr=args.learning_rate,
+                                  bias_correction=False,
+                                  max_grad_norm=1.0)
+            if args.loss_scale == 0:
+                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
+            else:
+                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
+            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
+                                                 t_total=num_train_optimization_steps)
+        else:
+            optimizer = BertAdam(optimizer_grouped_parameters,
+                                 lr=args.learning_rate,
+                                 warmup=args.warmup_proportion,
+                                 t_total=num_train_optimization_steps)

     global_step = 0
     if args.do_train:
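
This hunk (and the next one) also moves the existing "pooler" hack under the new guard: parameters of the unused pooler head are filtered out so apex never sees None gradients for them. A small self-contained sketch of that filter, using an illustrative toy model rather than the repository's:

from torch import nn

class ToyQAModel(nn.Module):
    # The pooler is never used in forward(), mimicking the situation the
    # hack's comment describes, so its parameters would get no gradients.
    def __init__(self):
        super().__init__()
        self.encoder = nn.Linear(16, 16)
        self.pooler = nn.Linear(16, 16)
        self.qa_outputs = nn.Linear(16, 2)

    def forward(self, x):
        return self.qa_outputs(self.encoder(x))

model = ToyQAModel()
param_optimizer = list(model.named_parameters())
# Each entry n is a (name, parameter) pair; drop everything under the pooler.
param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
assert all('pooler' not in name for name, _ in param_optimizer)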

View File

@@ -385,39 +385,40 @@ def main():
         model = torch.nn.DataParallel(model)

     # Prepare optimizer
-    param_optimizer = list(model.named_parameters())
-
-    # hack to remove pooler, which is not used
-    # thus it produce None grad that break apex
-    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
-
-    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-    optimizer_grouped_parameters = [
-        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
-    if args.fp16:
-        try:
-            from apex.optimizers import FP16_Optimizer
-            from apex.optimizers import FusedAdam
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-
-        optimizer = FusedAdam(optimizer_grouped_parameters,
-                              lr=args.learning_rate,
-                              bias_correction=False,
-                              max_grad_norm=1.0)
-        if args.loss_scale == 0:
-            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-        else:
-            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
-        warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
-                                             t_total=num_train_optimization_steps)
-    else:
-        optimizer = BertAdam(optimizer_grouped_parameters,
-                             lr=args.learning_rate,
-                             warmup=args.warmup_proportion,
-                             t_total=num_train_optimization_steps)
+    if args.do_train:
+        param_optimizer = list(model.named_parameters())
+
+        # hack to remove pooler, which is not used
+        # thus it produce None grad that break apex
+        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
+
+        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+        optimizer_grouped_parameters = [
+            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+            ]
+        if args.fp16:
+            try:
+                from apex.optimizers import FP16_Optimizer
+                from apex.optimizers import FusedAdam
+            except ImportError:
+                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
+
+            optimizer = FusedAdam(optimizer_grouped_parameters,
+                                  lr=args.learning_rate,
+                                  bias_correction=False,
+                                  max_grad_norm=1.0)
+            if args.loss_scale == 0:
+                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
+            else:
+                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
+            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
+                                                 t_total=num_train_optimization_steps)
+        else:
+            optimizer = BertAdam(optimizer_grouped_parameters,
+                                 lr=args.learning_rate,
+                                 warmup=args.warmup_proportion,
+                                 t_total=num_train_optimization_steps)

     global_step = 0
     if args.do_train: