Delete all mentions of Model2Model (#3019)
parent bb7c468520
commit 9df74b8bc4
@@ -220,96 +220,3 @@ print(sequence)
 ```
 
 The model only requires a single token as input as all the previous tokens' key/value pairs are contained in the `past`.
-
-### Model2Model example
-
-Encoder-decoder architectures require two tokenized inputs: one for the encoder and the other one for the decoder. Let's assume that we want to use `Model2Model` for generative question answering, and start by tokenizing the question and answer that will be fed to the model.
-
-```python
-import torch
-from transformers import BertTokenizer, Model2Model
-
-# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
-import logging
-logging.basicConfig(level=logging.INFO)
-
-# Load pre-trained model tokenizer (vocabulary)
-tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-
-# Encode the input to the encoder (the question)
-question = "Who was Jim Henson?"
-encoded_question = tokenizer.encode(question)
-
-# Encode the input to the decoder (the answer)
-answer = "Jim Henson was a puppeteer"
-encoded_answer = tokenizer.encode(answer)
-
-# Convert inputs to PyTorch tensors
-question_tensor = torch.tensor([encoded_question])
-answer_tensor = torch.tensor([encoded_answer])
-```
-
-Let's see how we can use `Model2Model` to get the value of the loss associated with this (question, answer) pair:
-
-```python
-# In order to compute the loss we need to provide language model
-# labels (the token ids that the model should have produced) to
-# the decoder.
-lm_labels = encoded_answer
-labels_tensor = torch.tensor([lm_labels])
-
-# Load pre-trained model (weights)
-model = Model2Model.from_pretrained('bert-base-uncased')
-
-# Set the model in evaluation mode to deactivate the DropOut modules
-# This is IMPORTANT to have reproducible results during evaluation!
-model.eval()
-
-# If you have a GPU, put everything on cuda
-question_tensor = question_tensor.to('cuda')
-answer_tensor = answer_tensor.to('cuda')
-labels_tensor = labels_tensor.to('cuda')
-model.to('cuda')
-
-# Compute the LM loss (no gradients needed)
-with torch.no_grad():
-    # See the models' docstrings for the detail of the inputs
-    outputs = model(question_tensor, answer_tensor, decoder_lm_labels=labels_tensor)
-    # Transformers models always output tuples.
-    # See the models' docstrings for the detail of all the outputs.
-    # In our case, the first element is the value of the LM loss.
-    lm_loss = outputs[0]
-```
-
-This loss can be used to fine-tune `Model2Model` on the question answering task. Assuming that we fine-tuned the model, let us now see how to generate an answer:
-
-```python
-# Let's re-use the previous question
-question = "Who was Jim Henson?"
-encoded_question = tokenizer.encode(question)
-question_tensor = torch.tensor([encoded_question])
-
-# This time we try to generate the answer, so we start the sequence with only the [CLS] token
-answer = "[CLS]"
-encoded_answer = tokenizer.encode(answer, add_special_tokens=False)
-answer_tensor = torch.tensor([encoded_answer])
-
-# Load the fine-tuned model (weights)
-model = Model2Model.from_pretrained('fine-tuned-weights')
-model.eval()
-
-# If you have a GPU, put everything on cuda
-question_tensor = question_tensor.to('cuda')
-answer_tensor = answer_tensor.to('cuda')
-model.to('cuda')
-
-# Predict all tokens
-with torch.no_grad():
-    outputs = model(question_tensor, answer_tensor)
-    predictions = outputs[0]
-
-# Confirm we were able to predict 'jim'
-predicted_index = torch.argmax(predictions[0, -1]).item()
-predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
-assert predicted_token == 'jim'
-```
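The removed quickstart stopped after predicting a single token. For readers who relied on it, here is a minimal greedy-decoding sketch built on the same, now-removed API; it assumes the pre-removal behavior shown above (`model(question_tensor, answer_tensor)` returning prediction scores as the first tuple element) and re-uses the hypothetical `'fine-tuned-weights'` checkpoint from the example. The loop length and the `[SEP]` stopping criterion are illustrative choices, not part of the original docs.

```python
# Sketch only: extends the removed example into a greedy decoding loop.
# Assumes the pre-removal Model2Model API and the hypothetical
# 'fine-tuned-weights' checkpoint named in the quickstart above.
import torch
from transformers import BertTokenizer, Model2Model

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = Model2Model.from_pretrained('fine-tuned-weights')
model.eval()

question_tensor = torch.tensor([tokenizer.encode("Who was Jim Henson?")])
# Prime the decoder with [CLS], as in the removed example
decoded = tokenizer.encode("[CLS]", add_special_tokens=False)

with torch.no_grad():
    for _ in range(20):  # cap the answer length
        answer_tensor = torch.tensor([decoded])
        predictions = model(question_tensor, answer_tensor)[0]
        next_token = torch.argmax(predictions[0, -1]).item()
        if next_token == tokenizer.sep_token_id:  # [SEP] ends the answer
            break
        decoded.append(next_token)

print(tokenizer.decode(decoded[1:]))  # strip the leading [CLS]
```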
@@ -241,7 +241,7 @@ if is_torch_available():
         CamembertForTokenClassification,
         CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
     )
-    from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model
+    from .modeling_encoder_decoder import PreTrainedEncoderDecoder
     from .modeling_t5 import (
        T5PreTrainedModel,
        T5Model,
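Note that `PreTrainedEncoderDecoder` stays importable. Since the `Model2Model.from_pretrained` removed in the next hunk merely forwarded one checkpoint name to both sides, an equivalent call can be sketched directly against the surviving class; the keyword argument names below are taken from that removed override, not invented:

```python
# Sketch: what the removed Model2Model.from_pretrained('bert-base-uncased')
# amounted to, expressed against the class this commit keeps. Keyword names
# come from the removed override shown in the next hunk.
from transformers import PreTrainedEncoderDecoder

model = PreTrainedEncoderDecoder.from_pretrained(
    encoder_pretrained_model_name_or_path="bert-base-uncased",
    decoder_pretrained_model_name_or_path="bert-base-uncased",
)
```

The only extra step the removed subclass performed on top of this was a call to its `tie_weights()`, which was a no-op (see the next hunk).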
@@ -234,62 +234,3 @@ class PreTrainedEncoderDecoder(nn.Module):
         decoder_outputs = self.decoder(decoder_input_ids, **kwargs_decoder)
 
         return decoder_outputs + encoder_outputs
-
-
-class Model2Model(PreTrainedEncoderDecoder):
-    r"""
-    :class:`~transformers.Model2Model` instantiates a Seq2Seq model
-    where the encoder and the decoder are of the same family. If the
-    name of or path to a pretrained model is specified, the encoder and
-    the decoder will be initialized with the pretrained weights (the
-    cross-attention will be initialized randomly if its weights are not
-    present).
-
-    It is possible to override this behavior and initialize, say, the decoder randomly
-    by creating it beforehand as follows:
-
-        config = BertConfig.from_pretrained('bert-base-uncased')
-        decoder = BertForMaskedLM(config)
-        model = Model2Model.from_pretrained('bert-base-uncased', decoder_model=decoder)
-    """
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.tie_weights()
-
-    def tie_weights(self):
-        """ Tie the encoder's and the decoder's embeddings together.
-
-        For each model we need to get down to the embedding weights. However,
-        the different model classes are inconsistent in that respect:
-        - BertModel: embeddings.word_embeddings
-        - RoBERTa: embeddings.word_embeddings
-        - XLMModel: embeddings
-        - GPT2: wte
-        - BertForMaskedLM: bert.embeddings.word_embeddings
-        - RobertaForMaskedLM: roberta.embeddings.word_embeddings
-
-        We could reach the embedding argument of the XEmbedding layer for
-        each model, but access is "blocked" by a model-specific keyword
-        (bert, ...), so tying is left unimplemented for now.
-        """
-        # self._tie_or_clone_weights(self.encoder, self.decoder)
-        pass
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
-        if (
-            "bert" not in pretrained_model_name_or_path
-            or "roberta" in pretrained_model_name_or_path
-            or "distilbert" in pretrained_model_name_or_path
-        ):
-            raise ValueError("Only the Bert model is currently supported.")
-
-        model = super().from_pretrained(
-            encoder_pretrained_model_name_or_path=pretrained_model_name_or_path,
-            decoder_pretrained_model_name_or_path=pretrained_model_name_or_path,
-            *args,
-            **kwargs,
-        )
-
-        return model
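The removed `tie_weights` docstring names the core obstacle: each architecture stores its input word embeddings under a different attribute path. A hypothetical lookup-table helper (all names illustrative, not a library API) makes that inconsistency concrete:

```python
# Hypothetical helper, not a transformers API: resolve the input word
# embeddings across the model classes listed in the removed docstring.
import torch.nn as nn

_EMBEDDING_PATHS = {
    "BertModel": "embeddings.word_embeddings",
    "RobertaModel": "embeddings.word_embeddings",
    "XLMModel": "embeddings",
    "GPT2Model": "wte",
    "BertForMaskedLM": "bert.embeddings.word_embeddings",
    "RobertaForMaskedLM": "roberta.embeddings.word_embeddings",
}

def get_word_embeddings(model: nn.Module) -> nn.Module:
    """Follow the dotted attribute path registered for the model's class."""
    module = model
    for attr in _EMBEDDING_PATHS[type(model).__name__].split("."):
        module = getattr(module, attr)
    return module

# With such a helper, the commented-out line in tie_weights could have read:
#     self._tie_or_clone_weights(get_word_embeddings(self.encoder),
#                                get_word_embeddings(self.decoder))
```

Later versions of the library sidestep this problem with a uniform `get_input_embeddings()` accessor on every model.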
@@ -1,50 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Hugging Face Inc. Team
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import unittest
-
-from transformers import is_torch_available
-
-from .utils import require_torch, slow
-
-
-if is_torch_available():
-    from transformers import BertModel, BertForMaskedLM, Model2Model
-    from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
-
-
-@require_torch
-class EncoderDecoderModelTest(unittest.TestCase):
-    @slow
-    def test_model2model_from_pretrained(self):
-        logging.basicConfig(level=logging.INFO)
-        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = Model2Model.from_pretrained(model_name)
-            self.assertIsInstance(model.encoder, BertModel)
-            self.assertIsInstance(model.decoder, BertForMaskedLM)
-            self.assertEqual(model.decoder.config.is_decoder, True)
-            self.assertEqual(model.encoder.config.is_decoder, False)
-
-    def test_model2model_from_pretrained_not_bert(self):
-        logging.basicConfig(level=logging.INFO)
-        with self.assertRaises(ValueError):
-            _ = Model2Model.from_pretrained("roberta")
-
-        with self.assertRaises(ValueError):
-            _ = Model2Model.from_pretrained("distilbert")
-
-        with self.assertRaises(ValueError):
-            _ = Model2Model.from_pretrained("does-not-exist")