lr_schedulers: add get_polynomial_decay_schedule_with_warmup (#6361)
* [wip] add get_polynomial_decay_schedule_with_warmup
* style
* add assert
* change lr_end to a much smaller default number
* check for exact equality
* [model_cards] electra-base-turkish-cased-ner (#6350)
* for electra-base-turkish-cased-ner
* Add metadata
  Co-authored-by: Julien Chaumond <chaumond@gmail.com>
* Temporarily de-activate TPU CI
* Update modeling_tf_utils.py (#6372)
  fix typo: ckeckpoint->checkpoint
* the test now works again (#6371)
* correct pl link in readme (#6364)
* refactor almost identical tests (#6339)
* refactor almost identical tests
* important to add a clear assert error message
* make the assert error even more descriptive than the original bt
* Small docfile fixes (#6328)
* Patch models (#6326)
* TFAlbertFor{TokenClassification, MultipleChoice}
* Patch models
* BERT and TF BERT info s
* Update check_repo
* Ci GitHub caching (#6382)
* Cache Github Actions CI
* Remove useless file
* Colab button (#6389)
* Add colab button
* Add colab link for tutorials
* Fix links for open in colab (#6391)
* Update src/transformers/optimization.py
  consistently use lr_end=1e-7 default
  Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
* [wip] add get_polynomial_decay_schedule_with_warmup
* style
* add assert
* change lr_end to a much smaller default number
* check for exact equality
* Update src/transformers/optimization.py
  consistently use lr_end=1e-7 default
  Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
* remove dup (leftover from merge)
* convert the test into the new refactored format
* stick to using the current_step as is, without ++

Co-authored-by: M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
Co-authored-by: Julien Chaumond <chaumond@gmail.com>
Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
Co-authored-by: Alexander Measure <ameasure@gmail.com>
Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
This commit is contained in:
parent 6c87b73d6b
commit ece0903e11
examples/lightning_base.py

@@ -25,6 +25,7 @@ from transformers.optimization import (
     get_cosine_schedule_with_warmup,
     get_cosine_with_hard_restarts_schedule_with_warmup,
     get_linear_schedule_with_warmup,
+    get_polynomial_decay_schedule_with_warmup,
 )
@@ -48,7 +49,7 @@ arg_to_scheduler = {
     "linear": get_linear_schedule_with_warmup,
     "cosine": get_cosine_schedule_with_warmup,
     "cosine_w_restarts": get_cosine_with_hard_restarts_schedule_with_warmup,
-    # polynomial': '', # TODO
+    "polynomial": get_polynomial_decay_schedule_with_warmup,
     # '': get_constant_schedule, # not supported for now
     # '': get_constant_schedule_with_warmup, # not supported for now
 }
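With this registration, the examples can resolve the new schedule by name. A minimal sketch of the lookup, assuming the name comes from a command-line flag; the subset mapping below is illustrative, not part of the diff:

    from transformers.optimization import (
        get_linear_schedule_with_warmup,
        get_polynomial_decay_schedule_with_warmup,
    )

    # Illustrative subset of the arg_to_scheduler mapping patched above;
    # the examples pick the factory by the string passed on the CLI
    # (assumed to be the --lr_scheduler flag).
    arg_to_scheduler = {
        "linear": get_linear_schedule_with_warmup,
        "polynomial": get_polynomial_decay_schedule_with_warmup,
    }

    schedule_fn = arg_to_scheduler["polynomial"]  # e.g. --lr_scheduler polynomial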
src/transformers/__init__.py

@@ -428,6 +428,7 @@ if is_torch_available():
         get_cosine_schedule_with_warmup,
         get_cosine_with_hard_restarts_schedule_with_warmup,
         get_linear_schedule_with_warmup,
+        get_polynomial_decay_schedule_with_warmup,
     )

     # Trainer
src/transformers/optimization.py

@@ -165,6 +165,52 @@ def get_cosine_with_hard_restarts_schedule_with_warmup(
     return LambdaLR(optimizer, lr_lambda, last_epoch)


+def get_polynomial_decay_schedule_with_warmup(
+    optimizer, num_warmup_steps, num_training_steps, lr_end=1e-7, power=1.0, last_epoch=-1
+):
+    """
+    Create a schedule with a learning rate that decreases as a polynomial decay
+    from the initial lr set in the optimizer to the end lr defined by `lr_end`,
+    after a warmup period during which it increases linearly from 0 to the
+    initial lr set in the optimizer.
+
+    Args:
+        optimizer (:class:`~torch.optim.Optimizer`):
+            The optimizer for which to schedule the learning rate.
+        num_warmup_steps (:obj:`int`):
+            The number of steps for the warmup phase.
+        num_training_steps (:obj:`int`):
+            The total number of training steps.
+        lr_end (:obj:`float`, `optional`, defaults to 1e-7):
+            The end learning rate.
+        power (:obj:`float`, `optional`, defaults to 1.0):
+            Power factor of the polynomial decay.
+        last_epoch (:obj:`int`, `optional`, defaults to -1):
+            The index of the last epoch when resuming training.
+
+    Return:
+        :obj:`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+
+    """
+
+    lr_init = optimizer.defaults["lr"]
+    assert lr_init > lr_end, f"lr_end ({lr_end}) must be smaller than initial lr ({lr_init})"
+
+    def lr_lambda(current_step: int):
+        if current_step < num_warmup_steps:
+            return float(current_step) / float(max(1, num_warmup_steps))
+        elif current_step > num_training_steps:
+            return lr_end / lr_init  # as LambdaLR multiplies by lr_init
+        else:
+            lr_range = lr_init - lr_end
+            decay_steps = num_training_steps - num_warmup_steps
+            pct_remaining = 1 - (current_step - num_warmup_steps) / decay_steps
+            decay = lr_range * pct_remaining ** power + lr_end
+            return decay / lr_init  # as LambdaLR multiplies by lr_init
+
+    return LambdaLR(optimizer, lr_lambda, last_epoch)
+
+
 class AdamW(Optimizer):
     """
     Implements Adam algorithm with weight decay fix as introduced in
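For orientation, a minimal usage sketch of the new schedule; the model, optimizer, and step counts below are illustrative, not from the diff:

    import torch

    from transformers import AdamW, get_polynomial_decay_schedule_with_warmup

    model = torch.nn.Linear(10, 2)
    optimizer = AdamW(model.parameters(), lr=5e-5)

    # lr rises linearly from 0 to 5e-5 over the first 100 steps, then decays
    # polynomially (here quadratically, power=2.0) to lr_end=1e-7 by step 1000.
    scheduler = get_polynomial_decay_schedule_with_warmup(
        optimizer, num_warmup_steps=100, num_training_steps=1000, lr_end=1e-7, power=2.0
    )

    for step in range(1000):
        # ... forward/backward pass and gradient computation would go here ...
        optimizer.step()
        scheduler.step()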
tests/test_optimization.py

@@ -32,6 +32,7 @@ if is_torch_available():
         get_cosine_schedule_with_warmup,
         get_cosine_with_hard_restarts_schedule_with_warmup,
         get_linear_schedule_with_warmup,
+        get_polynomial_decay_schedule_with_warmup,
     )
@@ -114,6 +115,10 @@ class ScheduleInitTest(unittest.TestCase):
                 {**common_kwargs, "num_cycles": 2},
                 [5.0, 10.0, 8.53, 5.0, 1.46, 10.0, 8.53, 5.0, 1.46, 0.0],
             ),
+            get_polynomial_decay_schedule_with_warmup: (
+                {**common_kwargs, "power": 2.0, "lr_end": 1e-7},
+                [5.0, 10.0, 7.656, 5.625, 3.906, 2.5, 1.406, 0.625, 0.156, 1e-07],
+            ),
         }

         for scheduler_func, data in scheds.items():
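As a sanity check, the expected values can be re-derived by hand from the lr_lambda above. A sketch for the value at step 3, assuming common_kwargs sets num_warmup_steps=2 and num_training_steps=10 with an initial lr of 10.0 (as the other entries in this test suggest):

    # Assumed test setup: lr_init=10.0, num_warmup_steps=2, num_training_steps=10.
    lr_init, lr_end, power = 10.0, 1e-7, 2.0
    num_warmup_steps, num_training_steps = 2, 10

    current_step = 3  # first step past warmup
    pct_remaining = 1 - (current_step - num_warmup_steps) / (num_training_steps - num_warmup_steps)  # 7/8
    decay = (lr_init - lr_end) * pct_remaining ** power + lr_end
    print(round(decay, 3))  # 7.656 -- matches the third expected value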