Revert low cpu mem tie weights (#29135)
* Revert "Add tie_weights() to LM heads and set bias in set_output_embeddings() (#28948)"
  This reverts commit 725f4ad1cc.
* Revert "Patch to skip failing `test_save_load_low_cpu_mem_usage` tests (#29043)"
  This reverts commit 4156f517ce.
This commit is contained in:
parent
15cfe38942
commit
0996a10077
|
@ -692,9 +692,6 @@ class BertLMPredictionHead(nn.Module):
|
|||
# Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def _tie_weights(self):
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states = self.transform(hidden_states)
|
||||
hidden_states = self.decoder(hidden_states)
|
||||
|
@ -1065,7 +1062,6 @@ class BertForPreTraining(BertPreTrainedModel):
|
|||
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.cls.predictions.decoder = new_embeddings
|
||||
self.cls.predictions.bias = new_embeddings.bias
|
||||
|
||||
@add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@replace_return_docstrings(output_type=BertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
|
||||
|
@ -1175,7 +1171,6 @@ class BertLMHeadModel(BertPreTrainedModel):
|
|||
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.cls.predictions.decoder = new_embeddings
|
||||
self.cls.predictions.bias = new_embeddings.bias
|
||||
|
||||
@add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@add_code_sample_docstrings(
|
||||
|
@ -1329,7 +1324,6 @@ class BertForMaskedLM(BertPreTrainedModel):
|
|||
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.cls.predictions.decoder = new_embeddings
|
||||
self.cls.predictions.bias = new_embeddings.bias
|
||||
|
||||
@add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@add_code_sample_docstrings(
|
||||
|
|
|
@ -1707,9 +1707,6 @@ class BigBirdLMPredictionHead(nn.Module):
|
|||
# Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def _tie_weights(self):
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states = self.transform(hidden_states)
|
||||
hidden_states = self.decoder(hidden_states)
|
||||
|
@ -2269,7 +2266,6 @@ class BigBirdForPreTraining(BigBirdPreTrainedModel):
|
|||
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.cls.predictions.decoder = new_embeddings
|
||||
self.cls.predictions.bias = new_embeddings.bias
|
||||
|
||||
@add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@replace_return_docstrings(output_type=BigBirdForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
|
||||
|
@ -2382,7 +2378,6 @@ class BigBirdForMaskedLM(BigBirdPreTrainedModel):
|
|||
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.cls.predictions.decoder = new_embeddings
|
||||
self.cls.predictions.bias = new_embeddings.bias
|
||||
|
||||
@add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
|
||||
|
@ -2524,7 +2519,6 @@ class BigBirdForCausalLM(BigBirdPreTrainedModel):
|
|||
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.cls.predictions.decoder = new_embeddings
|
||||
self.cls.predictions.bias = new_embeddings.bias
|
||||
|
||||
@add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@add_code_sample_docstrings(
|
||||
|
|
|
@ -523,9 +523,6 @@ class BlipTextLMPredictionHead(nn.Module):
|
|||
# Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def _tie_weights(self):
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states = self.transform(hidden_states)
|
||||
hidden_states = self.decoder(hidden_states)
|
||||
|
@ -820,7 +817,6 @@ class BlipTextLMHeadModel(BlipTextPreTrainedModel):
|
|||
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.cls.predictions.decoder = new_embeddings
|
||||
self.cls.predictions.bias = new_embeddings.bias
|
||||
|
||||
def forward(
|
||||
self,
|
||||
|
|
|
@ -608,9 +608,6 @@ class ErnieLMPredictionHead(nn.Module):
|
|||
# Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def _tie_weights(self):
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states = self.transform(hidden_states)
|
||||
hidden_states = self.decoder(hidden_states)
|
||||
|
@ -998,7 +995,6 @@ class ErnieForPreTraining(ErniePreTrainedModel):
|
|||
# Copied from transformers.models.bert.modeling_bert.BertForPreTraining.set_output_embeddings
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.cls.predictions.decoder = new_embeddings
|
||||
self.cls.predictions.bias = new_embeddings.bias
|
||||
|
||||
@add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@replace_return_docstrings(output_type=ErnieForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
|
||||
|
@ -1113,7 +1109,6 @@ class ErnieForCausalLM(ErniePreTrainedModel):
|
|||
# Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.set_output_embeddings
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.cls.predictions.decoder = new_embeddings
|
||||
self.cls.predictions.bias = new_embeddings.bias
|
||||
|
||||
@add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@add_code_sample_docstrings(
|
||||
|
@ -1274,7 +1269,6 @@ class ErnieForMaskedLM(ErniePreTrainedModel):
|
|||
# Copied from transformers.models.bert.modeling_bert.BertForMaskedLM.set_output_embeddings
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.cls.predictions.decoder = new_embeddings
|
||||
self.cls.predictions.bias = new_embeddings.bias
|
||||
|
||||
@add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@add_code_sample_docstrings(
|
||||
|
|
|
@ -589,9 +589,6 @@ class LayoutLMLMPredictionHead(nn.Module):
|
|||
# Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def _tie_weights(self):
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states = self.transform(hidden_states)
|
||||
hidden_states = self.decoder(hidden_states)
|
||||
|
@ -872,7 +869,6 @@ class LayoutLMForMaskedLM(LayoutLMPreTrainedModel):
|
|||
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.cls.predictions.decoder = new_embeddings
|
||||
self.cls.predictions.bias = new_embeddings.bias
|
||||
|
||||
@add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
|
||||
|
|
|
@ -318,9 +318,6 @@ class MarkupLMLMPredictionHead(nn.Module):
|
|||
# Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def _tie_weights(self):
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states = self.transform(hidden_states)
|
||||
hidden_states = self.decoder(hidden_states)
|
||||
|
|
|
@ -659,9 +659,6 @@ class MegatronBertLMPredictionHead(nn.Module):
|
|||
# Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def _tie_weights(self):
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states = self.transform(hidden_states)
|
||||
hidden_states = self.decoder(hidden_states)
|
||||
|
@ -1026,7 +1023,6 @@ class MegatronBertForPreTraining(MegatronBertPreTrainedModel):
|
|||
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.cls.predictions.decoder = new_embeddings
|
||||
self.cls.predictions.bias = new_embeddings.bias
|
||||
|
||||
@add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@replace_return_docstrings(output_type=MegatronBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
|
||||
|
@ -1136,7 +1132,6 @@ class MegatronBertForCausalLM(MegatronBertPreTrainedModel):
|
|||
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.cls.predictions.decoder = new_embeddings
|
||||
self.cls.predictions.bias = new_embeddings.bias
|
||||
|
||||
@add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
|
||||
|
@ -1295,7 +1290,6 @@ class MegatronBertForMaskedLM(MegatronBertPreTrainedModel):
|
|||
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.cls.predictions.decoder = new_embeddings
|
||||
self.cls.predictions.bias = new_embeddings.bias
|
||||
|
||||
@add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@add_code_sample_docstrings(
|
||||
|
|
|
@ -587,7 +587,6 @@ class MPNetForMaskedLM(MPNetPreTrainedModel):
|
|||
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.lm_head.decoder = new_embeddings
|
||||
self.lm_head.bias = new_embeddings.bias
|
||||
|
||||
@add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@add_code_sample_docstrings(
|
||||
|
@ -660,9 +659,6 @@ class MPNetLMHead(nn.Module):
|
|||
# Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def _tie_weights(self):
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def forward(self, features, **kwargs):
|
||||
x = self.dense(features)
|
||||
x = gelu(x)
|
||||
|
|
|
@ -810,9 +810,6 @@ class MraLMPredictionHead(nn.Module):
|
|||
# Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def _tie_weights(self):
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states = self.transform(hidden_states)
|
||||
hidden_states = self.decoder(hidden_states)
|
||||
|
@ -1046,7 +1043,6 @@ class MraForMaskedLM(MraPreTrainedModel):
|
|||
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.cls.predictions.decoder = new_embeddings
|
||||
self.cls.predictions.bias = new_embeddings.bias
|
||||
|
||||
@add_start_docstrings_to_model_forward(MRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@add_code_sample_docstrings(
|
||||
|
|
|
@ -679,9 +679,6 @@ class NezhaLMPredictionHead(nn.Module):
|
|||
# Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def _tie_weights(self):
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states = self.transform(hidden_states)
|
||||
hidden_states = self.decoder(hidden_states)
|
||||
|
@ -1047,7 +1044,6 @@ class NezhaForPreTraining(NezhaPreTrainedModel):
|
|||
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.cls.predictions.decoder = new_embeddings
|
||||
self.cls.predictions.bias = new_embeddings.bias
|
||||
|
||||
@add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@replace_return_docstrings(output_type=NezhaForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
|
||||
|
@ -1156,7 +1152,6 @@ class NezhaForMaskedLM(NezhaPreTrainedModel):
|
|||
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.cls.predictions.decoder = new_embeddings
|
||||
self.cls.predictions.bias = new_embeddings.bias
|
||||
|
||||
@add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@add_code_sample_docstrings(
|
||||
|
|
|
@ -428,9 +428,6 @@ class NystromformerLMPredictionHead(nn.Module):
|
|||
# Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def _tie_weights(self):
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states = self.transform(hidden_states)
|
||||
hidden_states = self.decoder(hidden_states)
|
||||
|
@ -669,7 +666,6 @@ class NystromformerForMaskedLM(NystromformerPreTrainedModel):
|
|||
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.cls.predictions.decoder = new_embeddings
|
||||
self.cls.predictions.bias = new_embeddings.bias
|
||||
|
||||
@add_start_docstrings_to_model_forward(NYSTROMFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@add_code_sample_docstrings(
|
||||
|
|
|
@ -683,9 +683,6 @@ class QDQBertLMPredictionHead(nn.Module):
|
|||
# Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def _tie_weights(self):
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states = self.transform(hidden_states)
|
||||
hidden_states = self.decoder(hidden_states)
|
||||
|
@ -1027,7 +1024,6 @@ class QDQBertLMHeadModel(QDQBertPreTrainedModel):
|
|||
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.cls.predictions.decoder = new_embeddings
|
||||
self.cls.predictions.bias = new_embeddings.bias
|
||||
|
||||
@add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
|
||||
|
@ -1194,7 +1190,6 @@ class QDQBertForMaskedLM(QDQBertPreTrainedModel):
|
|||
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.cls.predictions.decoder = new_embeddings
|
||||
self.cls.predictions.bias = new_embeddings.bias
|
||||
|
||||
@add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@add_code_sample_docstrings(
|
||||
|
|
|
@ -744,9 +744,6 @@ class RoCBertLMPredictionHead(nn.Module):
|
|||
# Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def _tie_weights(self):
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states = self.transform(hidden_states)
|
||||
hidden_states = self.decoder(hidden_states)
|
||||
|
@ -1093,7 +1090,6 @@ class RoCBertForPreTraining(RoCBertPreTrainedModel):
|
|||
# Copied from transformers.models.bert.modeling_bert.BertForPreTraining.set_output_embeddings
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.cls.predictions.decoder = new_embeddings
|
||||
self.cls.predictions.bias = new_embeddings.bias
|
||||
|
||||
@add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
|
||||
|
@ -1286,7 +1282,6 @@ class RoCBertForMaskedLM(RoCBertPreTrainedModel):
|
|||
# Copied from transformers.models.bert.modeling_bert.BertForMaskedLM.set_output_embeddings
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.cls.predictions.decoder = new_embeddings
|
||||
self.cls.predictions.bias = new_embeddings.bias
|
||||
|
||||
@add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
def forward(
|
||||
|
@ -1424,7 +1419,6 @@ class RoCBertForCausalLM(RoCBertPreTrainedModel):
|
|||
# Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.set_output_embeddings
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.cls.predictions.decoder = new_embeddings
|
||||
self.cls.predictions.bias = new_embeddings.bias
|
||||
|
||||
@add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
|
||||
|
|
|
@ -729,9 +729,6 @@ class TapasLMPredictionHead(nn.Module):
|
|||
# Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def _tie_weights(self):
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states = self.transform(hidden_states)
|
||||
hidden_states = self.decoder(hidden_states)
|
||||
|
@ -1011,7 +1008,6 @@ class TapasForMaskedLM(TapasPreTrainedModel):
|
|||
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.cls.predictions.decoder = new_embeddings
|
||||
self.cls.predictions.bias = new_embeddings.bias
|
||||
|
||||
@add_start_docstrings_to_model_forward(TAPAS_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
|
||||
|
|
|
@ -896,7 +896,6 @@ class ViltForMaskedLM(ViltPreTrainedModel):
|
|||
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.mlm_score.decoder = new_embeddings
|
||||
self.mlm_score.bias = new_embeddings.bias
|
||||
|
||||
@add_start_docstrings_to_model_forward(VILT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
|
||||
|
@ -1043,9 +1042,6 @@ class ViltMLMHead(nn.Module):
|
|||
# Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def _tie_weights(self):
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def forward(self, x):
|
||||
x = self.transform(x)
|
||||
x = self.decoder(x)
|
||||
|
|
|
@ -499,9 +499,6 @@ class VisualBertLMPredictionHead(nn.Module):
|
|||
# Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def _tie_weights(self):
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states = self.transform(hidden_states)
|
||||
hidden_states = self.decoder(hidden_states)
|
||||
|
@ -882,7 +879,6 @@ class VisualBertForPreTraining(VisualBertPreTrainedModel):
|
|||
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.cls.predictions.decoder = new_embeddings
|
||||
self.cls.predictions.bias = new_embeddings.bias
|
||||
|
||||
@add_start_docstrings_to_model_forward(VISUAL_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@replace_return_docstrings(output_type=VisualBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
|
||||
|
|
|
@ -626,9 +626,6 @@ class YosoLMPredictionHead(nn.Module):
|
|||
# Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def _tie_weights(self):
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states = self.transform(hidden_states)
|
||||
hidden_states = self.decoder(hidden_states)
|
||||
|
@ -867,7 +864,6 @@ class YosoForMaskedLM(YosoPreTrainedModel):
|
|||
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.cls.predictions.decoder = new_embeddings
|
||||
self.cls.predictions.bias = new_embeddings.bias
|
||||
|
||||
@add_start_docstrings_to_model_forward(YOSO_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@add_code_sample_docstrings(
|
||||
|
|
|
@ -305,12 +305,6 @@ class BertGenerationEncoderTest(ModelTesterMixin, GenerationTesterMixin, Pipelin
|
|||
model = BertGenerationEncoder.from_pretrained("google/bert_for_seq_generation_L-24_bbc_encoder")
|
||||
self.assertIsNotNone(model)
|
||||
|
||||
@unittest.skip(
|
||||
"Not currently compatible. Fails with - NotImplementedError: Cannot copy out of meta tensor; no data!"
|
||||
)
|
||||
def test_save_load_low_cpu_mem_usage(self):
|
||||
pass
|
||||
|
||||
|
||||
@require_torch
|
||||
class BertGenerationEncoderIntegrationTest(unittest.TestCase):
|
||||
|
|
|
@ -564,10 +564,6 @@ class DeformableDetrModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineT
|
|||
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
|
||||
)
|
||||
|
||||
@unittest.skip("Cannot be initialized on meta device as some weights are modified during the initialization")
|
||||
def test_save_load_low_cpu_mem_usage(self):
|
||||
pass
|
||||
|
||||
def test_two_stage_training(self):
|
||||
model_class = DeformableDetrForObjectDetection
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
|
|
@ -520,10 +520,6 @@ class DetaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
|
|||
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
|
||||
)
|
||||
|
||||
@unittest.skip("Cannot be initialized on meta device as some weights are modified during the initialization")
|
||||
def test_save_load_low_cpu_mem_usage(self):
|
||||
pass
|
||||
|
||||
|
||||
TOLERANCE = 1e-4
|
||||
|
||||
|
|
|
@ -329,12 +329,6 @@ class FSMTModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
|
|||
def test_resize_embeddings_untied(self):
|
||||
pass
|
||||
|
||||
@unittest.skip(
|
||||
"Not currently compatible. Fails with - NotImplementedError: Cannot copy out of meta tensor; no data!"
|
||||
)
|
||||
def test_save_load_low_cpu_mem_usage(self):
|
||||
pass
|
||||
|
||||
|
||||
@require_torch
|
||||
class FSMTHeadTests(unittest.TestCase):
|
||||
|
|
|
@ -372,12 +372,6 @@ class MarianModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix
|
|||
def test_training_gradient_checkpointing_use_reentrant_false(self):
|
||||
pass
|
||||
|
||||
@unittest.skip(
|
||||
"Not currently compatible. Fails with - NotImplementedError: Cannot copy out of meta tensor; no data!"
|
||||
)
|
||||
def test_save_load_low_cpu_mem_usage(self):
|
||||
pass
|
||||
|
||||
|
||||
def assert_tensors_close(a, b, atol=1e-12, prefix=""):
|
||||
"""If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error."""
|
||||
|
|
|
@ -1144,10 +1144,6 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
|
|||
|
||||
self.assertNotIn(config.pad_token_id, output_generate)
|
||||
|
||||
@unittest.skip("Fails with - TypeError: _weight_norm_interface() missing 1 required positional argument: 'dim'")
|
||||
def test_save_load_low_cpu_mem_usage(self):
|
||||
pass
|
||||
|
||||
|
||||
def get_bip_bip(bip_duration=0.125, duration=0.5, sample_rate=32000):
|
||||
"""Produces a series of 'bip bip' sounds at a given frequency."""
|
||||
|
|
|
@ -687,12 +687,6 @@ class ReformerLocalAttnModelTest(ReformerTesterMixin, GenerationTesterMixin, Mod
|
|||
def test_left_padding_compatibility(self):
|
||||
pass
|
||||
|
||||
@unittest.skip(
|
||||
"Not currently compatible. Fails with - NotImplementedError: Cannot copy out of meta tensor; no data!"
|
||||
)
|
||||
def test_save_load_low_cpu_mem_usage(self):
|
||||
pass
|
||||
|
||||
|
||||
@require_torch
|
||||
class ReformerLSHAttnModelTest(
|
||||
|
@ -854,12 +848,6 @@ class ReformerLSHAttnModelTest(
|
|||
def test_left_padding_compatibility(self):
|
||||
pass
|
||||
|
||||
@unittest.skip(
|
||||
"Not currently compatible. Fails with - NotImplementedError: Cannot copy out of meta tensor; no data!"
|
||||
)
|
||||
def test_save_load_low_cpu_mem_usage(self):
|
||||
pass
|
||||
|
||||
|
||||
@require_torch
|
||||
@require_sentencepiece
|
||||
|
|
|
@ -515,12 +515,6 @@ class XLMRobertaXLModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTes
|
|||
self.assertEqual(position_ids.shape, expected_positions.shape)
|
||||
self.assertTrue(torch.all(torch.eq(position_ids, expected_positions)))
|
||||
|
||||
@unittest.skip(
|
||||
"Not currently compatible. Fails with - NotImplementedError: Cannot copy out of meta tensor; no data!"
|
||||
)
|
||||
def test_save_load_low_cpu_mem_usage(self):
|
||||
pass
|
||||
|
||||
|
||||
@require_torch
|
||||
class XLMRobertaModelXLIntegrationTest(unittest.TestCase):
|
||||
|
|
|
@ -435,23 +435,6 @@ class ModelTesterMixin:
|
|||
max_diff = (model_slow_init.state_dict()[key] - model_fast_init.state_dict()[key]).sum().item()
|
||||
self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
|
||||
|
||||
def test_save_load_low_cpu_mem_usage(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
for model_class in self.all_model_classes:
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
model_to_save = model_class(config)
|
||||
|
||||
model_to_save.save_pretrained(tmpdirname)
|
||||
|
||||
model = model_class.from_pretrained(
|
||||
tmpdirname,
|
||||
low_cpu_mem_usage=True,
|
||||
)
|
||||
|
||||
# The low_cpu_mem_usage=True causes the model params to be initialized with device=meta. If there are
|
||||
# any unloaded or untied parameters, then trying to move it to device=torch_device will throw an error.
|
||||
model.to(torch_device)
|
||||
|
||||
def test_fast_init_context_manager(self):
|
||||
# 1. Create a dummy class. Should have buffers as well? To make sure we test __init__
|
||||
class MyClass(PreTrainedModel):
|
||||
|
|
Loading…
Reference in New Issue