Prepare optimizer only when args.do_train is True
parent 3ae8c8be1e
commit 74dbba64bc
@@ -534,36 +534,37 @@ def main():
         model = torch.nn.DataParallel(model)

     # Prepare optimizer
-    param_optimizer = list(model.named_parameters())
-    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-    optimizer_grouped_parameters = [
-        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
-
-    if args.fp16:
-        try:
-            from apex.optimizers import FP16_Optimizer
-            from apex.optimizers import FusedAdam
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-
-        optimizer = FusedAdam(optimizer_grouped_parameters,
-                              lr=args.learning_rate,
-                              bias_correction=False,
-                              max_grad_norm=1.0)
-        if args.loss_scale == 0:
-            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-        else:
-            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
-        warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
-                                             t_total=num_train_optimization_steps)
-
-    else:
-        optimizer = BertAdam(optimizer_grouped_parameters,
-                             lr=args.learning_rate,
-                             warmup=args.warmup_proportion,
-                             t_total=num_train_optimization_steps)
+    if args.do_train:
+        param_optimizer = list(model.named_parameters())
+        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+        optimizer_grouped_parameters = [
+            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+            ]
+
+        if args.fp16:
+            try:
+                from apex.optimizers import FP16_Optimizer
+                from apex.optimizers import FusedAdam
+            except ImportError:
+                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
+
+            optimizer = FusedAdam(optimizer_grouped_parameters,
+                                  lr=args.learning_rate,
+                                  bias_correction=False,
+                                  max_grad_norm=1.0)
+            if args.loss_scale == 0:
+                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
+            else:
+                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
+            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
+                                                 t_total=num_train_optimization_steps)
+
+        else:
+            optimizer = BertAdam(optimizer_grouped_parameters,
+                                 lr=args.learning_rate,
+                                 warmup=args.warmup_proportion,
+                                 t_total=num_train_optimization_steps)

     global_step = 0
     if args.do_train:
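The change is the same in each example script: the parameter groups, the optimizer, and the warmup schedule are now built only under if args.do_train:, so an eval-only invocation (--do_eval without --do_train) skips the whole block. A minimal self-contained sketch of the resulting control flow follows; the stand-in values and the bare model.parameters() call are illustrative, not code from the repository, and it assumes the pytorch_pretrained_bert package is installed.

from argparse import Namespace

import torch
from pytorch_pretrained_bert import BertAdam  # assumes the pytorch_pretrained_bert package

# Stand-ins for what the script has already set up at this point (illustrative values).
args = Namespace(do_train=True, learning_rate=5e-5, warmup_proportion=0.1)
model = torch.nn.Linear(8, 2)            # stand-in for the BERT model
num_train_optimization_steps = 1000      # stand-in for the value computed from the training set

optimizer = None
if args.do_train:
    # Built only when training was requested; an eval-only run leaves optimizer as None
    # and never evaluates t_total, which is only meaningful when training data exists.
    optimizer = BertAdam(model.parameters(),
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_optimization_steps)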
@@ -763,35 +763,36 @@ def main():
         model = torch.nn.DataParallel(model)

     # Prepare optimizer
-    param_optimizer = list(model.named_parameters())
-    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-    optimizer_grouped_parameters = [
-        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
-    if args.fp16:
-        try:
-            from apex.optimizers import FP16_Optimizer
-            from apex.optimizers import FusedAdam
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-
-        optimizer = FusedAdam(optimizer_grouped_parameters,
-                              lr=args.learning_rate,
-                              bias_correction=False,
-                              max_grad_norm=1.0)
-        if args.loss_scale == 0:
-            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-        else:
-            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
-        warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
-                                             t_total=num_train_optimization_steps)
-
-    else:
-        optimizer = BertAdam(optimizer_grouped_parameters,
-                             lr=args.learning_rate,
-                             warmup=args.warmup_proportion,
-                             t_total=num_train_optimization_steps)
+    if args.do_train:
+        param_optimizer = list(model.named_parameters())
+        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+        optimizer_grouped_parameters = [
+            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+            ]
+        if args.fp16:
+            try:
+                from apex.optimizers import FP16_Optimizer
+                from apex.optimizers import FusedAdam
+            except ImportError:
+                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
+
+            optimizer = FusedAdam(optimizer_grouped_parameters,
+                                  lr=args.learning_rate,
+                                  bias_correction=False,
+                                  max_grad_norm=1.0)
+            if args.loss_scale == 0:
+                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
+            else:
+                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
+            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
+                                                 t_total=num_train_optimization_steps)
+
+        else:
+            optimizer = BertAdam(optimizer_grouped_parameters,
+                                 lr=args.learning_rate,
+                                 warmup=args.warmup_proportion,
+                                 t_total=num_train_optimization_steps)

     global_step = 0
     nb_tr_steps = 0
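The two parameter groups built above follow the usual BERT fine-tuning convention: weight decay of 0.01 on ordinary weights, none on biases or LayerNorm parameters. The split is a substring test on the parameter name, which is why the list includes 'LayerNorm.weight' and 'LayerNorm.bias'. A small self-contained check of that split on a toy module (the Toy class is illustrative, not the BERT model):

import torch

class Toy(torch.nn.Module):
    # Toy stand-in whose attribute is literally named "LayerNorm", as in BERT,
    # so the substring match below behaves the same way it does in the real model.
    def __init__(self):
        super().__init__()
        self.dense = torch.nn.Linear(4, 4)
        self.LayerNorm = torch.nn.LayerNorm(4)

model = Toy()
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

decayed   = [n for n, _ in param_optimizer if not any(nd in n for nd in no_decay)]
undecayed = [n for n, _ in param_optimizer if any(nd in n for nd in no_decay)]
print(decayed)    # ['dense.weight']
print(undecayed)  # ['dense.bias', 'LayerNorm.weight', 'LayerNorm.bias']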
@@ -183,19 +183,20 @@ def main():
     eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

     # Prepare optimizer
-    param_optimizer = list(model.named_parameters())
-    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-    optimizer_grouped_parameters = [
-        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
-    num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size
-    optimizer = OpenAIAdam(optimizer_grouped_parameters,
-                           lr=args.learning_rate,
-                           warmup=args.warmup_proportion,
-                           max_grad_norm=args.max_grad_norm,
-                           weight_decay=args.weight_decay,
-                           t_total=num_train_optimization_steps)
+    if args.do_train:
+        param_optimizer = list(model.named_parameters())
+        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+        optimizer_grouped_parameters = [
+            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+            ]
+        num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size
+        optimizer = OpenAIAdam(optimizer_grouped_parameters,
+                               lr=args.learning_rate,
+                               warmup=args.warmup_proportion,
+                               max_grad_norm=args.max_grad_norm,
+                               weight_decay=args.weight_decay,
+                               t_total=num_train_optimization_steps)

     if args.do_train:
         nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
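In this script the optimization step count is derived directly from the dataset size, the epoch count, and the batch size with floor division, and it now sits under the same guard as the OpenAIAdam construction. A quick standalone check of the arithmetic with made-up numbers:

# Made-up numbers, just to illustrate the floor division used for the step count.
train_examples = 1000      # stands in for len(train_data)
num_train_epochs = 3       # args.num_train_epochs
train_batch_size = 32      # args.train_batch_size

num_train_optimization_steps = train_examples * num_train_epochs // train_batch_size
print(num_train_optimization_steps)   # 3000 // 32 == 93; the remainder is truncated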
@@ -922,40 +922,41 @@ def main():
         model = torch.nn.DataParallel(model)

     # Prepare optimizer
-    param_optimizer = list(model.named_parameters())
-
-    # hack to remove pooler, which is not used
-    # thus it produce None grad that break apex
-    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
-
-    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-    optimizer_grouped_parameters = [
-        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
-
-    if args.fp16:
-        try:
-            from apex.optimizers import FP16_Optimizer
-            from apex.optimizers import FusedAdam
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-
-        optimizer = FusedAdam(optimizer_grouped_parameters,
-                              lr=args.learning_rate,
-                              bias_correction=False,
-                              max_grad_norm=1.0)
-        if args.loss_scale == 0:
-            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-        else:
-            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
-        warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
-                                             t_total=num_train_optimization_steps)
-    else:
-        optimizer = BertAdam(optimizer_grouped_parameters,
-                             lr=args.learning_rate,
-                             warmup=args.warmup_proportion,
-                             t_total=num_train_optimization_steps)
+    if args.do_train:
+        param_optimizer = list(model.named_parameters())
+
+        # hack to remove pooler, which is not used
+        # thus it produce None grad that break apex
+        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
+
+        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+        optimizer_grouped_parameters = [
+            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+            ]
+
+        if args.fp16:
+            try:
+                from apex.optimizers import FP16_Optimizer
+                from apex.optimizers import FusedAdam
+            except ImportError:
+                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
+
+            optimizer = FusedAdam(optimizer_grouped_parameters,
+                                  lr=args.learning_rate,
+                                  bias_correction=False,
+                                  max_grad_norm=1.0)
+            if args.loss_scale == 0:
+                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
+            else:
+                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
+            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
+                                                 t_total=num_train_optimization_steps)
+        else:
+            optimizer = BertAdam(optimizer_grouped_parameters,
+                                 lr=args.learning_rate,
+                                 warmup=args.warmup_proportion,
+                                 t_total=num_train_optimization_steps)

     global_step = 0
     if args.do_train:
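The SQuAD scripts additionally drop the pooler parameters before building the groups: the pooler is never used in the forward pass, so its parameters receive no gradient, and (per the comment kept in the code) those None grads break apex. The filter is a plain substring test on the parameter name. A toy illustration of that filtering step; the Toy class below is mine, not code from the repository:

import torch

class Toy(torch.nn.Module):
    # Toy stand-in: has an unused "pooler" submodule, like the model in the script.
    def __init__(self):
        super().__init__()
        self.encoder = torch.nn.Linear(4, 4)
        self.pooler = torch.nn.Linear(4, 4)   # never used in forward

    def forward(self, x):
        return self.encoder(x)

model = Toy()
param_optimizer = list(model.named_parameters())
# Drop parameters whose name contains 'pooler'; n[0] is the name, n[1] the tensor.
param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
print([name for name, _ in param_optimizer])   # ['encoder.weight', 'encoder.bias']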
@@ -385,39 +385,40 @@ def main():
         model = torch.nn.DataParallel(model)

     # Prepare optimizer
-    param_optimizer = list(model.named_parameters())
-
-    # hack to remove pooler, which is not used
-    # thus it produce None grad that break apex
-    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
-
-    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-    optimizer_grouped_parameters = [
-        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
-    if args.fp16:
-        try:
-            from apex.optimizers import FP16_Optimizer
-            from apex.optimizers import FusedAdam
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-
-        optimizer = FusedAdam(optimizer_grouped_parameters,
-                              lr=args.learning_rate,
-                              bias_correction=False,
-                              max_grad_norm=1.0)
-        if args.loss_scale == 0:
-            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-        else:
-            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
-        warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
-                                             t_total=num_train_optimization_steps)
-    else:
-        optimizer = BertAdam(optimizer_grouped_parameters,
-                             lr=args.learning_rate,
-                             warmup=args.warmup_proportion,
-                             t_total=num_train_optimization_steps)
+    if args.do_train:
+        param_optimizer = list(model.named_parameters())
+
+        # hack to remove pooler, which is not used
+        # thus it produce None grad that break apex
+        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
+
+        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+        optimizer_grouped_parameters = [
+            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+            ]
+        if args.fp16:
+            try:
+                from apex.optimizers import FP16_Optimizer
+                from apex.optimizers import FusedAdam
+            except ImportError:
+                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
+
+            optimizer = FusedAdam(optimizer_grouped_parameters,
+                                  lr=args.learning_rate,
+                                  bias_correction=False,
+                                  max_grad_norm=1.0)
+            if args.loss_scale == 0:
+                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
+            else:
+                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
+            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
+                                                 t_total=num_train_optimization_steps)
+        else:
+            optimizer = BertAdam(optimizer_grouped_parameters,
+                                 lr=args.learning_rate,
+                                 warmup=args.warmup_proportion,
+                                 t_total=num_train_optimization_steps)

     global_step = 0
     if args.do_train:
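In the fp16 branch a WarmupLinearSchedule is kept alongside FusedAdam (created with bias_correction=False and its own max_grad_norm), presumably so the training loop can scale the learning rate by hand, since FusedAdam has no built-in warmup. As a rough self-contained sketch of what a linear warmup-then-decay multiplier looks like; this helper is my own illustration, not the library's class:

def linear_warmup_multiplier(progress, warmup):
    """Illustrative helper (not WarmupLinearSchedule): linear ramp-up over the first
    `warmup` fraction of training, then linear decay to zero.
    `progress` is global_step / t_total, in [0, 1]."""
    if progress < warmup:
        return progress / warmup
    return max((1.0 - progress) / (1.0 - warmup), 0.0)

# Learning rate at a few points of a run with a base lr of 5e-5 and 10% warmup.
for p in (0.0, 0.05, 0.1, 0.5, 1.0):
    print(p, round(5e-5 * linear_warmup_multiplier(p, warmup=0.1), 8))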