diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py
index 02e9203e7a..c389257430 100644
--- a/examples/pytorch/summarization/run_summarization_no_trainer.py
+++ b/examples/pytorch/summarization/run_summarization_no_trainer.py
@@ -526,7 +526,7 @@ def main():
 
     # Optimizer
     # Split weights in two groups, one with weight decay and the other not.
-    no_decay = ["bias", "LayerNorm.weight"]
+    no_decay = ["bias", "LayerNorm.weight", "layer_norm.weight"]
     optimizer_grouped_parameters = [
         {
             "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
diff --git a/src/transformers/models/longt5/modeling_longt5.py b/src/transformers/models/longt5/modeling_longt5.py
index bba8bb7fde..9b60689ca1 100644
--- a/src/transformers/models/longt5/modeling_longt5.py
+++ b/src/transformers/models/longt5/modeling_longt5.py
@@ -32,7 +32,8 @@ from ...modeling_outputs import (
     Seq2SeqLMOutput,
     Seq2SeqModelOutput,
 )
-from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer
 from ...utils import (
     DUMMY_INPUTS,
     DUMMY_MASK,
@@ -260,6 +261,8 @@ except Exception:
     logger.warning("discovered apex but it failed to load, falling back to LongT5LayerNorm")
     pass
 
+ALL_LAYERNORM_LAYERS.append(LongT5LayerNorm)
+
 
 # Copied from transformers.models.t5.modeling_t5.T5DenseActDense with T5->LongT5
 class LongT5DenseActDense(nn.Module):
diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py
index 3c0212e1c8..6be8a6e151 100644
--- a/src/transformers/models/t5/modeling_t5.py
+++ b/src/transformers/models/t5/modeling_t5.py
@@ -34,7 +34,7 @@ from ...modeling_outputs import (
     Seq2SeqModelOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer
 from ...utils import (
     DUMMY_INPUTS,
     DUMMY_MASK,
@@ -275,6 +275,8 @@ except Exception:
     logger.warning("discovered apex but it failed to load, falling back to T5LayerNorm")
     pass
 
+ALL_LAYERNORM_LAYERS.append(T5LayerNorm)
+
 
 class T5DenseActDense(nn.Module):
     def __init__(self, config: T5Config):
diff --git a/src/transformers/pytorch_utils.py b/src/transformers/pytorch_utils.py
index ddfadfcbc0..c7bfba81fb 100644
--- a/src/transformers/pytorch_utils.py
+++ b/src/transformers/pytorch_utils.py
@@ -21,6 +21,8 @@ from torch import _softmax_backward_data, nn
 
 from .utils import logging
 
+ALL_LAYERNORM_LAYERS = [nn.LayerNorm]
+
 logger = logging.get_logger(__name__)
 
 is_torch_less_than_1_8 = version.parse(version.parse(torch.__version__).base_version) < version.parse("1.8.0")
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 6e3e847093..35342c0415 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -71,6 +71,7 @@ from .modelcard import TrainingSummary
 from .modeling_utils import PreTrainedModel, load_sharded_checkpoint, unwrap_model
 from .models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
 from .optimization import Adafactor, get_scheduler
+from .pytorch_utils import ALL_LAYERNORM_LAYERS
 from .tokenization_utils_base import PreTrainedTokenizerBase
 from .trainer_callback import (
     CallbackHandler,
@@ -967,7 +968,7 @@ class Trainer:
         opt_model = self.model_wrapped if is_sagemaker_mp_enabled() else self.model
 
         if self.optimizer is None:
-            decay_parameters = get_parameter_names(opt_model, [nn.LayerNorm])
+            decay_parameters = get_parameter_names(opt_model, ALL_LAYERNORM_LAYERS)
             decay_parameters = [name for name in decay_parameters if "bias" not in name]
             optimizer_grouped_parameters = [
                 {
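
Context for the change above (not part of the patch): T5LayerNorm and LongT5LayerNorm are RMS-style norms implemented as plain nn.Module subclasses, so they are not instances of nn.LayerNorm, and get_parameter_names(opt_model, [nn.LayerNorm]) left their weights in the weight-decay group. The ALL_LAYERNORM_LAYERS registry lets Trainer exclude them again. Below is a minimal, self-contained sketch of that mechanism; RMSLayerNorm is a hypothetical stand-in for the T5 norms and get_parameter_names is a simplified take on transformers.trainer_pt_utils.get_parameter_names, not the library code itself.

import torch
from torch import nn


class RMSLayerNorm(nn.Module):
    # Hypothetical stand-in for T5LayerNorm / LongT5LayerNorm: an RMS norm with a
    # weight but no bias, and crucially *not* a subclass of nn.LayerNorm.
    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states


# Mirrors the registry added in pytorch_utils.py plus the model-side append.
ALL_LAYERNORM_LAYERS = [nn.LayerNorm]
ALL_LAYERNORM_LAYERS.append(RMSLayerNorm)


def get_parameter_names(model, forbidden_layer_types):
    # Simplified sketch: collect parameter names recursively, skipping any
    # parameters owned by modules of a forbidden type.
    result = []
    for name, child in model.named_children():
        if isinstance(child, tuple(forbidden_layer_types)):
            continue
        result += [f"{name}.{n}" for n in get_parameter_names(child, forbidden_layer_types)]
    result += list(model._parameters.keys())
    return result


model = nn.Sequential(nn.Linear(8, 8), RMSLayerNorm(8))

print(get_parameter_names(model, [nn.LayerNorm]))
# ['0.weight', '0.bias', '1.weight']  -> the RMS norm weight would receive weight decay

print(get_parameter_names(model, ALL_LAYERNORM_LAYERS))
# ['0.weight', '0.bias']  -> the RMS norm weight is excluded from decay, as intended

With the registry in place, Trainer.create_optimizer and the no-trainer script (via the added "layer_norm.weight" pattern, which matches T5's parameter names) end up agreeing on which parameters are decayed.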