[cleanup] Hoist ModelTester objects to top level (#4939)
Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
parent 0c55a384f8
commit c852036b4a
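For orientation, here is a minimal, self-contained sketch of the pattern this commit applies to each test file; the Toy* names are illustrative and do not appear in the diff. Each *ModelTester, previously a class nested inside its unittest.TestCase, becomes a module-level class with hard-coded defaults, and setUp constructs it directly instead of through the test class.

    import unittest


    class ToyModelTester:
        """Hoisted helper: formerly defined as ToyModelTest.ToyModelTester."""

        def __init__(self, parent):
            self.parent = parent
            self.batch_size = 13  # defaults are hard-coded instead of __init__ keyword arguments

        def prepare_config_and_inputs(self):
            # Stand-in for the real method, which builds a config plus dummy input tensors.
            return {"batch_size": self.batch_size}


    class ToyModelTest(unittest.TestCase):
        def setUp(self):
            # Before this change: self.model_tester = ToyModelTest.ToyModelTester(self)
            self.model_tester = ToyModelTester(self)

        def test_prepare_config_and_inputs(self):
            inputs = self.model_tester.prepare_config_and_inputs()
            self.assertEqual(inputs["batch_size"], 13)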
@@ -37,6 +37,226 @@ if is_torch_available():
     from transformers.modeling_albert import ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST
 
 
+class AlbertModelTester:
+    def __init__(
+        self, parent,
+    ):
+        self.parent = parent
+        self.batch_size = 13
+        self.seq_length = 7
+        self.is_training = True
+        self.use_input_mask = True
+        self.use_token_type_ids = True
+        self.use_labels = True
+        self.vocab_size = 99
+        self.embedding_size = 16
+        self.hidden_size = 36
+        self.num_hidden_layers = 6
+        self.num_hidden_groups = 6
+        self.num_attention_heads = 6
+        self.intermediate_size = 37
+        self.hidden_act = "gelu"
+        self.hidden_dropout_prob = 0.1
+        self.attention_probs_dropout_prob = 0.1
+        self.max_position_embeddings = 512
+        self.type_vocab_size = 16
+        self.type_sequence_label_size = 2
+        self.initializer_range = 0.02
+        self.num_labels = 3
+        self.num_choices = 4
+        self.scope = None
+
+    def prepare_config_and_inputs(self):
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+        input_mask = None
+        if self.use_input_mask:
+            input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
+
+        token_type_ids = None
+        if self.use_token_type_ids:
+            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
+
+        sequence_labels = None
+        token_labels = None
+        choice_labels = None
+        if self.use_labels:
+            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
+            choice_labels = ids_tensor([self.batch_size], self.num_choices)
+
+        config = AlbertConfig(
+            vocab_size=self.vocab_size,
+            hidden_size=self.hidden_size,
+            num_hidden_layers=self.num_hidden_layers,
+            num_attention_heads=self.num_attention_heads,
+            intermediate_size=self.intermediate_size,
+            hidden_act=self.hidden_act,
+            hidden_dropout_prob=self.hidden_dropout_prob,
+            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
+            max_position_embeddings=self.max_position_embeddings,
+            type_vocab_size=self.type_vocab_size,
+            initializer_range=self.initializer_range,
+            num_hidden_groups=self.num_hidden_groups,
+        )
+
+        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+
+    def check_loss_output(self, result):
+        self.parent.assertListEqual(list(result["loss"].size()), [])
+
+    def create_and_check_albert_model(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        model = AlbertModel(config=config)
+        model.to(torch_device)
+        model.eval()
+        sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
+        sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
+        sequence_output, pooled_output = model(input_ids)
+
+        result = {
+            "sequence_output": sequence_output,
+            "pooled_output": pooled_output,
+        }
+        self.parent.assertListEqual(
+            list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
+        )
+        self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
+
+    def create_and_check_albert_for_pretraining(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        model = AlbertForPreTraining(config=config)
+        model.to(torch_device)
+        model.eval()
+        loss, prediction_scores, sop_scores = model(
+            input_ids,
+            attention_mask=input_mask,
+            token_type_ids=token_type_ids,
+            labels=token_labels,
+            sentence_order_label=sequence_labels,
+        )
+        result = {
+            "loss": loss,
+            "prediction_scores": prediction_scores,
+            "sop_scores": sop_scores,
+        }
+        self.parent.assertListEqual(
+            list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
+        )
+        self.parent.assertListEqual(list(result["sop_scores"].size()), [self.batch_size, config.num_labels])
+        self.check_loss_output(result)
+
+    def create_and_check_albert_for_masked_lm(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        model = AlbertForMaskedLM(config=config)
+        model.to(torch_device)
+        model.eval()
+        loss, prediction_scores = model(
+            input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels
+        )
+        result = {
+            "loss": loss,
+            "prediction_scores": prediction_scores,
+        }
+        self.parent.assertListEqual(
+            list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
+        )
+        self.check_loss_output(result)
+
+    def create_and_check_albert_for_question_answering(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        model = AlbertForQuestionAnswering(config=config)
+        model.to(torch_device)
+        model.eval()
+        loss, start_logits, end_logits = model(
+            input_ids,
+            attention_mask=input_mask,
+            token_type_ids=token_type_ids,
+            start_positions=sequence_labels,
+            end_positions=sequence_labels,
+        )
+        result = {
+            "loss": loss,
+            "start_logits": start_logits,
+            "end_logits": end_logits,
+        }
+        self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
+        self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
+        self.check_loss_output(result)
+
+    def create_and_check_albert_for_sequence_classification(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        config.num_labels = self.num_labels
+        model = AlbertForSequenceClassification(config)
+        model.to(torch_device)
+        model.eval()
+        loss, logits = model(
+            input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels
+        )
+        result = {
+            "loss": loss,
+            "logits": logits,
+        }
+        self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
+        self.check_loss_output(result)
+
+    def create_and_check_albert_for_token_classification(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        config.num_labels = self.num_labels
+        model = AlbertForTokenClassification(config=config)
+        model.to(torch_device)
+        model.eval()
+        loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
+        result = {
+            "loss": loss,
+            "logits": logits,
+        }
+        self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels])
+        self.check_loss_output(result)
+
+    def create_and_check_albert_for_multiple_choice(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        config.num_choices = self.num_choices
+        model = AlbertForMultipleChoice(config=config)
+        model.to(torch_device)
+        model.eval()
+        multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+        multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+        multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+        loss, logits = model(
+            multiple_choice_inputs_ids,
+            attention_mask=multiple_choice_input_mask,
+            token_type_ids=multiple_choice_token_type_ids,
+            labels=choice_labels,
+        )
+        result = {
+            "loss": loss,
+            "logits": logits,
+        }
+        self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_choices])
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        (
+            config,
+            input_ids,
+            token_type_ids,
+            input_mask,
+            sequence_labels,
+            token_labels,
+            choice_labels,
+        ) = config_and_inputs
+        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
+        return config, inputs_dict
+
+
 @require_torch
 class AlbertModelTest(ModelTesterMixin, unittest.TestCase):

@@ -54,256 +274,8 @@ class AlbertModelTest(ModelTesterMixin, unittest.TestCase):
         else ()
     )
 
-    class AlbertModelTester(object):
-        def __init__(
-            self,
-            parent,
-            batch_size=13,
-            seq_length=7,
-            is_training=True,
-            use_input_mask=True,
-            use_token_type_ids=True,
-            use_labels=True,
-            vocab_size=99,
-            embedding_size=16,
-            hidden_size=36,
-            num_hidden_layers=6,
-            num_hidden_groups=6,
-            num_attention_heads=6,
-            intermediate_size=37,
-            hidden_act="gelu",
-            hidden_dropout_prob=0.1,
-            attention_probs_dropout_prob=0.1,
-            max_position_embeddings=512,
-            type_vocab_size=16,
-            type_sequence_label_size=2,
-            initializer_range=0.02,
-            num_labels=3,
-            num_choices=4,
-            scope=None,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.is_training = is_training
-            self.use_input_mask = use_input_mask
-            self.use_token_type_ids = use_token_type_ids
-            self.use_labels = use_labels
-            self.vocab_size = vocab_size
-            self.embedding_size = embedding_size
-            self.hidden_size = hidden_size
-            self.num_hidden_layers = num_hidden_layers
-            self.num_attention_heads = num_attention_heads
-            self.intermediate_size = intermediate_size
-            self.hidden_act = hidden_act
-            self.hidden_dropout_prob = hidden_dropout_prob
-            self.attention_probs_dropout_prob = attention_probs_dropout_prob
-            self.max_position_embeddings = max_position_embeddings
-            self.type_vocab_size = type_vocab_size
-            self.type_sequence_label_size = type_sequence_label_size
-            self.initializer_range = initializer_range
-            self.num_labels = num_labels
-            self.num_choices = num_choices
-            self.scope = scope
-            self.num_hidden_groups = num_hidden_groups
-
-        def prepare_config_and_inputs(self):
-            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-            input_mask = None
-            if self.use_input_mask:
-                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-            token_type_ids = None
-            if self.use_token_type_ids:
-                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-            sequence_labels = None
-            token_labels = None
-            choice_labels = None
-            if self.use_labels:
-                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-                choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-            config = AlbertConfig(
-                vocab_size=self.vocab_size,
-                hidden_size=self.hidden_size,
-                num_hidden_layers=self.num_hidden_layers,
-                num_attention_heads=self.num_attention_heads,
-                intermediate_size=self.intermediate_size,
-                hidden_act=self.hidden_act,
-                hidden_dropout_prob=self.hidden_dropout_prob,
-                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-                max_position_embeddings=self.max_position_embeddings,
-                type_vocab_size=self.type_vocab_size,
-                initializer_range=self.initializer_range,
-                num_hidden_groups=self.num_hidden_groups,
-            )
-
-            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-        def check_loss_output(self, result):
-            self.parent.assertListEqual(list(result["loss"].size()), [])
-
-        def create_and_check_albert_model(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = AlbertModel(config=config)
-            model.to(torch_device)
-            model.eval()
-            sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-            sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
-            sequence_output, pooled_output = model(input_ids)
-
-            result = {
-                "sequence_output": sequence_output,
-                "pooled_output": pooled_output,
-            }
-            self.parent.assertListEqual(
-                list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
-            )
-            self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
-
-        def create_and_check_albert_for_pretraining(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = AlbertForPreTraining(config=config)
-            model.to(torch_device)
-            model.eval()
-            loss, prediction_scores, sop_scores = model(
-                input_ids,
-                attention_mask=input_mask,
-                token_type_ids=token_type_ids,
-                labels=token_labels,
-                sentence_order_label=sequence_labels,
-            )
-            result = {
-                "loss": loss,
-                "prediction_scores": prediction_scores,
-                "sop_scores": sop_scores,
-            }
-            self.parent.assertListEqual(
-                list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
-            )
-            self.parent.assertListEqual(list(result["sop_scores"].size()), [self.batch_size, config.num_labels])
-            self.check_loss_output(result)
-
-        def create_and_check_albert_for_masked_lm(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = AlbertForMaskedLM(config=config)
-            model.to(torch_device)
-            model.eval()
-            loss, prediction_scores = model(
-                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels
-            )
-            result = {
-                "loss": loss,
-                "prediction_scores": prediction_scores,
-            }
-            self.parent.assertListEqual(
-                list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
-            )
-            self.check_loss_output(result)
-
-        def create_and_check_albert_for_question_answering(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = AlbertForQuestionAnswering(config=config)
-            model.to(torch_device)
-            model.eval()
-            loss, start_logits, end_logits = model(
-                input_ids,
-                attention_mask=input_mask,
-                token_type_ids=token_type_ids,
-                start_positions=sequence_labels,
-                end_positions=sequence_labels,
-            )
-            result = {
-                "loss": loss,
-                "start_logits": start_logits,
-                "end_logits": end_logits,
-            }
-            self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
-            self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
-            self.check_loss_output(result)
-
-        def create_and_check_albert_for_sequence_classification(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            config.num_labels = self.num_labels
-            model = AlbertForSequenceClassification(config)
-            model.to(torch_device)
-            model.eval()
-            loss, logits = model(
-                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels
-            )
-            result = {
-                "loss": loss,
-                "logits": logits,
-            }
-            self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
-            self.check_loss_output(result)
-
-        def create_and_check_albert_for_token_classification(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            config.num_labels = self.num_labels
-            model = AlbertForTokenClassification(config=config)
-            model.to(torch_device)
-            model.eval()
-            loss, logits = model(
-                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels
-            )
-            result = {
-                "loss": loss,
-                "logits": logits,
-            }
-            self.parent.assertListEqual(
-                list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels]
-            )
-            self.check_loss_output(result)
-
-        def create_and_check_albert_for_multiple_choice(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            config.num_choices = self.num_choices
-            model = AlbertForMultipleChoice(config=config)
-            model.to(torch_device)
-            model.eval()
-            multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-            multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-            multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-            loss, logits = model(
-                multiple_choice_inputs_ids,
-                attention_mask=multiple_choice_input_mask,
-                token_type_ids=multiple_choice_token_type_ids,
-                labels=choice_labels,
-            )
-            result = {
-                "loss": loss,
-                "logits": logits,
-            }
-            self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_choices])
-            self.check_loss_output(result)
-
-        def prepare_config_and_inputs_for_common(self):
-            config_and_inputs = self.prepare_config_and_inputs()
-            (
-                config,
-                input_ids,
-                token_type_ids,
-                input_mask,
-                sequence_labels,
-                token_labels,
-                choice_labels,
-            ) = config_and_inputs
-            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-            return config, inputs_dict
-
     def setUp(self):
-        self.model_tester = AlbertModelTest.AlbertModelTester(self)
+        self.model_tester = AlbertModelTester(self)
         self.config_tester = ConfigTester(self, config_class=AlbertConfig, hidden_size=37)
 
     def test_config(self):

@@ -27,6 +27,140 @@ if is_torch_available():
     from transformers import CTRLConfig, CTRLModel, CTRL_PRETRAINED_MODEL_ARCHIVE_LIST, CTRLLMHeadModel
 
 
+class CTRLModelTester:
+    def __init__(
+        self, parent,
+    ):
+        self.parent = parent
+        self.batch_size = 14
+        self.seq_length = 7
+        self.is_training = True
+        self.use_token_type_ids = True
+        self.use_input_mask = True
+        self.use_labels = True
+        self.use_mc_token_ids = True
+        self.vocab_size = 99
+        self.hidden_size = 32
+        self.num_hidden_layers = 5
+        self.num_attention_heads = 4
+        self.intermediate_size = 37
+        self.hidden_act = "gelu"
+        self.hidden_dropout_prob = 0.1
+        self.attention_probs_dropout_prob = 0.1
+        self.max_position_embeddings = 512
+        self.type_vocab_size = 16
+        self.type_sequence_label_size = 2
+        self.initializer_range = 0.02
+        self.num_labels = 3
+        self.num_choices = 4
+        self.scope = None
+
+    def prepare_config_and_inputs(self):
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+        input_mask = None
+        if self.use_input_mask:
+            input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
+
+        token_type_ids = None
+        if self.use_token_type_ids:
+            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
+
+        mc_token_ids = None
+        if self.use_mc_token_ids:
+            mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length)
+
+        sequence_labels = None
+        token_labels = None
+        choice_labels = None
+        if self.use_labels:
+            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
+            choice_labels = ids_tensor([self.batch_size], self.num_choices)
+
+        config = CTRLConfig(
+            vocab_size=self.vocab_size,
+            n_embd=self.hidden_size,
+            n_layer=self.num_hidden_layers,
+            n_head=self.num_attention_heads,
+            # intermediate_size=self.intermediate_size,
+            # hidden_act=self.hidden_act,
+            # hidden_dropout_prob=self.hidden_dropout_prob,
+            # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
+            n_positions=self.max_position_embeddings,
+            n_ctx=self.max_position_embeddings
+            # type_vocab_size=self.type_vocab_size,
+            # initializer_range=self.initializer_range
+        )
+
+        head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
+
+        return (
+            config,
+            input_ids,
+            input_mask,
+            head_mask,
+            token_type_ids,
+            mc_token_ids,
+            sequence_labels,
+            token_labels,
+            choice_labels,
+        )
+
+    def check_loss_output(self, result):
+        self.parent.assertListEqual(list(result["loss"].size()), [])
+
+    def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
+        model = CTRLModel(config=config)
+        model.to(torch_device)
+        model.eval()
+
+        model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
+        model(input_ids, token_type_ids=token_type_ids)
+        sequence_output, presents = model(input_ids)
+
+        result = {
+            "sequence_output": sequence_output,
+            "presents": presents,
+        }
+        self.parent.assertListEqual(
+            list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
+        )
+        self.parent.assertEqual(len(result["presents"]), config.n_layer)
+
+    def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
+        model = CTRLLMHeadModel(config)
+        model.to(torch_device)
+        model.eval()
+
+        loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
+
+        result = {"loss": loss, "lm_logits": lm_logits}
+        self.parent.assertListEqual(list(result["loss"].size()), [])
+        self.parent.assertListEqual(
+            list(result["lm_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size]
+        )
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+
+        (
+            config,
+            input_ids,
+            input_mask,
+            head_mask,
+            token_type_ids,
+            mc_token_ids,
+            sequence_labels,
+            token_labels,
+            choice_labels,
+        ) = config_and_inputs
+
+        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "head_mask": head_mask}
+
+        return config, inputs_dict
+
+
 @require_torch
 class CTRLModelTest(ModelTesterMixin, unittest.TestCase):

@@ -37,164 +171,8 @@ class CTRLModelTest(ModelTesterMixin, unittest.TestCase):
     test_resize_embeddings = False
     test_head_masking = False
 
-    class CTRLModelTester(object):
-        def __init__(
-            self,
-            parent,
-            batch_size=14,
-            seq_length=7,
-            is_training=True,
-            use_token_type_ids=True,
-            use_input_mask=True,
-            use_labels=True,
-            use_mc_token_ids=True,
-            vocab_size=99,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            intermediate_size=37,
-            hidden_act="gelu",
-            hidden_dropout_prob=0.1,
-            attention_probs_dropout_prob=0.1,
-            max_position_embeddings=512,
-            type_vocab_size=16,
-            type_sequence_label_size=2,
-            initializer_range=0.02,
-            num_labels=3,
-            num_choices=4,
-            scope=None,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.is_training = is_training
-            self.use_token_type_ids = use_token_type_ids
-            self.use_input_mask = use_input_mask
-            self.use_labels = use_labels
-            self.use_mc_token_ids = use_mc_token_ids
-            self.vocab_size = vocab_size
-            self.hidden_size = hidden_size
-            self.num_hidden_layers = num_hidden_layers
-            self.num_attention_heads = num_attention_heads
-            self.intermediate_size = intermediate_size
-            self.hidden_act = hidden_act
-            self.hidden_dropout_prob = hidden_dropout_prob
-            self.attention_probs_dropout_prob = attention_probs_dropout_prob
-            self.max_position_embeddings = max_position_embeddings
-            self.type_vocab_size = type_vocab_size
-            self.type_sequence_label_size = type_sequence_label_size
-            self.initializer_range = initializer_range
-            self.num_labels = num_labels
-            self.num_choices = num_choices
-            self.scope = scope
-
-        def prepare_config_and_inputs(self):
-            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-            input_mask = None
-            if self.use_input_mask:
-                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-            token_type_ids = None
-            if self.use_token_type_ids:
-                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-            mc_token_ids = None
-            if self.use_mc_token_ids:
-                mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length)
-
-            sequence_labels = None
-            token_labels = None
-            choice_labels = None
-            if self.use_labels:
-                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-                choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-            config = CTRLConfig(
-                vocab_size=self.vocab_size,
-                n_embd=self.hidden_size,
-                n_layer=self.num_hidden_layers,
-                n_head=self.num_attention_heads,
-                # intermediate_size=self.intermediate_size,
-                # hidden_act=self.hidden_act,
-                # hidden_dropout_prob=self.hidden_dropout_prob,
-                # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-                n_positions=self.max_position_embeddings,
-                n_ctx=self.max_position_embeddings
-                # type_vocab_size=self.type_vocab_size,
-                # initializer_range=self.initializer_range
-            )
-
-            head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
-
-            return (
-                config,
-                input_ids,
-                input_mask,
-                head_mask,
-                token_type_ids,
-                mc_token_ids,
-                sequence_labels,
-                token_labels,
-                choice_labels,
-            )
-
-        def check_loss_output(self, result):
-            self.parent.assertListEqual(list(result["loss"].size()), [])
-
-        def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
-            model = CTRLModel(config=config)
-            model.to(torch_device)
-            model.eval()
-
-            model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
-            model(input_ids, token_type_ids=token_type_ids)
-            sequence_output, presents = model(input_ids)
-
-            result = {
-                "sequence_output": sequence_output,
-                "presents": presents,
-            }
-            self.parent.assertListEqual(
-                list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
-            )
-            self.parent.assertEqual(len(result["presents"]), config.n_layer)
-
-        def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
-            model = CTRLLMHeadModel(config)
-            model.to(torch_device)
-            model.eval()
-
-            loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
-
-            result = {"loss": loss, "lm_logits": lm_logits}
-            self.parent.assertListEqual(list(result["loss"].size()), [])
-            self.parent.assertListEqual(
-                list(result["lm_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size]
-            )
-
-        def prepare_config_and_inputs_for_common(self):
-            config_and_inputs = self.prepare_config_and_inputs()
-
-            (
-                config,
-                input_ids,
-                input_mask,
-                head_mask,
-                token_type_ids,
-                mc_token_ids,
-                sequence_labels,
-                token_labels,
-                choice_labels,
-            ) = config_and_inputs
-
-            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "head_mask": head_mask}
-
-            return config, inputs_dict
-
     def setUp(self):
-        self.model_tester = CTRLModelTest.CTRLModelTester(self)
+        self.model_tester = CTRLModelTester(self)
        self.config_tester = ConfigTester(self, config_class=CTRLConfig, n_embd=37)
 
     def test_config(self):

@@ -34,27 +34,6 @@ if is_torch_available():
         DistilBertForSequenceClassification,
     )
 
-
-@require_torch
-class DistilBertModelTest(ModelTesterMixin, unittest.TestCase):
-
-    all_model_classes = (
-        (
-            DistilBertModel,
-            DistilBertForMaskedLM,
-            DistilBertForMultipleChoice,
-            DistilBertForQuestionAnswering,
-            DistilBertForSequenceClassification,
-            DistilBertForTokenClassification,
-        )
-        if is_torch_available()
-        else None
-    )
-    test_pruning = True
-    test_torchscript = True
-    test_resize_embeddings = True
-    test_head_masking = True
-
 class DistilBertModelTester(object):
     def __init__(
         self,

@@ -245,8 +224,29 @@ class DistilBertModelTest(ModelTesterMixin, unittest.TestCase):
         inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
         return config, inputs_dict
 
+
+@require_torch
+class DistilBertModelTest(ModelTesterMixin, unittest.TestCase):
+
+    all_model_classes = (
+        (
+            DistilBertModel,
+            DistilBertForMaskedLM,
+            DistilBertForMultipleChoice,
+            DistilBertForQuestionAnswering,
+            DistilBertForSequenceClassification,
+            DistilBertForTokenClassification,
+        )
+        if is_torch_available()
+        else None
+    )
+    test_pruning = True
+    test_torchscript = True
+    test_resize_embeddings = True
+    test_head_masking = True
+
     def setUp(self):
-        self.model_tester = DistilBertModelTest.DistilBertModelTester(self)
+        self.model_tester = DistilBertModelTester(self)
         self.config_tester = ConfigTester(self, config_class=DistilBertConfig, dim=37)
 
     def test_config(self):

@@ -36,6 +36,252 @@ if is_torch_available():
     from transformers.modeling_electra import ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST
 
 
+class ElectraModelTester:
+    def __init__(
+        self, parent,
+    ):
+        self.parent = parent
+        self.batch_size = 13
+        self.seq_length = 7
+        self.is_training = True
+        self.use_input_mask = True
+        self.use_token_type_ids = True
+        self.use_labels = True
+        self.vocab_size = 99
+        self.hidden_size = 32
+        self.num_hidden_layers = 5
+        self.num_attention_heads = 4
+        self.intermediate_size = 37
+        self.hidden_act = "gelu"
+        self.hidden_dropout_prob = 0.1
+        self.attention_probs_dropout_prob = 0.1
+        self.max_position_embeddings = 512
+        self.type_vocab_size = 16
+        self.type_sequence_label_size = 2
+        self.initializer_range = 0.02
+        self.num_labels = 3
+        self.num_choices = 4
+        self.scope = None
+
+    def prepare_config_and_inputs(self):
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+        input_mask = None
+        if self.use_input_mask:
+            input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
+
+        token_type_ids = None
+        if self.use_token_type_ids:
+            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
+
+        sequence_labels = None
+        token_labels = None
+        choice_labels = None
+        if self.use_labels:
+            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
+            choice_labels = ids_tensor([self.batch_size], self.num_choices)
+            fake_token_labels = ids_tensor([self.batch_size, self.seq_length], 1)
+
+        config = ElectraConfig(
+            vocab_size=self.vocab_size,
+            hidden_size=self.hidden_size,
+            num_hidden_layers=self.num_hidden_layers,
+            num_attention_heads=self.num_attention_heads,
+            intermediate_size=self.intermediate_size,
+            hidden_act=self.hidden_act,
+            hidden_dropout_prob=self.hidden_dropout_prob,
+            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
+            max_position_embeddings=self.max_position_embeddings,
+            type_vocab_size=self.type_vocab_size,
+            is_decoder=False,
+            initializer_range=self.initializer_range,
+        )
+
+        return (
+            config,
+            input_ids,
+            token_type_ids,
+            input_mask,
+            sequence_labels,
+            token_labels,
+            choice_labels,
+            fake_token_labels,
+        )
+
+    def check_loss_output(self, result):
+        self.parent.assertListEqual(list(result["loss"].size()), [])
+
+    def create_and_check_electra_model(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_mask,
+        sequence_labels,
+        token_labels,
+        choice_labels,
+        fake_token_labels,
+    ):
+        model = ElectraModel(config=config)
+        model.to(torch_device)
+        model.eval()
+        (sequence_output,) = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
+        (sequence_output,) = model(input_ids, token_type_ids=token_type_ids)
+        (sequence_output,) = model(input_ids)
+
+        result = {
+            "sequence_output": sequence_output,
+        }
+        self.parent.assertListEqual(
+            list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
+        )
+
+    def create_and_check_electra_for_masked_lm(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_mask,
+        sequence_labels,
+        token_labels,
+        choice_labels,
+        fake_token_labels,
+    ):
+        model = ElectraForMaskedLM(config=config)
+        model.to(torch_device)
+        model.eval()
+        loss, prediction_scores = model(
+            input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels
+        )
+        result = {
+            "loss": loss,
+            "prediction_scores": prediction_scores,
+        }
+        self.parent.assertListEqual(
+            list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
+        )
+        self.check_loss_output(result)
+
+    def create_and_check_electra_for_token_classification(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_mask,
+        sequence_labels,
+        token_labels,
+        choice_labels,
+        fake_token_labels,
+    ):
+        config.num_labels = self.num_labels
+        model = ElectraForTokenClassification(config=config)
+        model.to(torch_device)
+        model.eval()
+        loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
+        result = {
+            "loss": loss,
+            "logits": logits,
+        }
+        self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels])
+        self.check_loss_output(result)
+
+    def create_and_check_electra_for_pretraining(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_mask,
+        sequence_labels,
+        token_labels,
+        choice_labels,
+        fake_token_labels,
+    ):
+        config.num_labels = self.num_labels
+        model = ElectraForPreTraining(config=config)
+        model.to(torch_device)
+        model.eval()
+        loss, logits = model(
+            input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=fake_token_labels
+        )
+        result = {
+            "loss": loss,
+            "logits": logits,
+        }
+        self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length])
+        self.check_loss_output(result)
+
+    def create_and_check_electra_for_sequence_classification(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_mask,
+        sequence_labels,
+        token_labels,
+        choice_labels,
+        fake_token_labels,
+    ):
+        config.num_labels = self.num_labels
+        model = ElectraForSequenceClassification(config)
+        model.to(torch_device)
+        model.eval()
+        loss, logits = model(
+            input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels
+        )
+        result = {
+            "loss": loss,
+            "logits": logits,
+        }
+        self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
+        self.check_loss_output(result)
+
+    def create_and_check_electra_for_question_answering(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_mask,
+        sequence_labels,
+        token_labels,
+        choice_labels,
+        fake_token_labels,
+    ):
+        model = ElectraForQuestionAnswering(config=config)
+        model.to(torch_device)
+        model.eval()
+        loss, start_logits, end_logits = model(
+            input_ids,
+            attention_mask=input_mask,
+            token_type_ids=token_type_ids,
+            start_positions=sequence_labels,
+            end_positions=sequence_labels,
+        )
+        result = {
+            "loss": loss,
+            "start_logits": start_logits,
+            "end_logits": end_logits,
+        }
+        self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
+        self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
+        self.check_loss_output(result)
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        (
+            config,
+            input_ids,
+            token_type_ids,
+            input_mask,
+            sequence_labels,
+            token_labels,
+            choice_labels,
+            fake_token_labels,
+        ) = config_and_inputs
+        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
+        return config, inputs_dict
+
+
 @require_torch
 class ElectraModelTest(ModelTesterMixin, unittest.TestCase):

@@ -52,279 +298,8 @@ class ElectraModelTest(ModelTesterMixin, unittest.TestCase):
         else ()
     )
 
-    class ElectraModelTester(object):
-        def __init__(
-            self,
-            parent,
-            batch_size=13,
-            seq_length=7,
-            is_training=True,
-            use_input_mask=True,
-            use_token_type_ids=True,
-            use_labels=True,
-            vocab_size=99,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            intermediate_size=37,
-            hidden_act="gelu",
-            hidden_dropout_prob=0.1,
-            attention_probs_dropout_prob=0.1,
-            max_position_embeddings=512,
-            type_vocab_size=16,
-            type_sequence_label_size=2,
-            initializer_range=0.02,
-            num_labels=3,
-            num_choices=4,
-            scope=None,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.is_training = is_training
-            self.use_input_mask = use_input_mask
-            self.use_token_type_ids = use_token_type_ids
-            self.use_labels = use_labels
-            self.vocab_size = vocab_size
-            self.hidden_size = hidden_size
-            self.num_hidden_layers = num_hidden_layers
-            self.num_attention_heads = num_attention_heads
-            self.intermediate_size = intermediate_size
-            self.hidden_act = hidden_act
-            self.hidden_dropout_prob = hidden_dropout_prob
-            self.attention_probs_dropout_prob = attention_probs_dropout_prob
-            self.max_position_embeddings = max_position_embeddings
-            self.type_vocab_size = type_vocab_size
-            self.type_sequence_label_size = type_sequence_label_size
-            self.initializer_range = initializer_range
-            self.num_labels = num_labels
-            self.num_choices = num_choices
-            self.scope = scope
-
-        def prepare_config_and_inputs(self):
-            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-            input_mask = None
-            if self.use_input_mask:
-                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-            token_type_ids = None
-            if self.use_token_type_ids:
-                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-            sequence_labels = None
-            token_labels = None
-            choice_labels = None
-            if self.use_labels:
-                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-                choice_labels = ids_tensor([self.batch_size], self.num_choices)
-                fake_token_labels = ids_tensor([self.batch_size, self.seq_length], 1)
-
-            config = ElectraConfig(
-                vocab_size=self.vocab_size,
-                hidden_size=self.hidden_size,
-                num_hidden_layers=self.num_hidden_layers,
-                num_attention_heads=self.num_attention_heads,
-                intermediate_size=self.intermediate_size,
-                hidden_act=self.hidden_act,
-                hidden_dropout_prob=self.hidden_dropout_prob,
-                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-                max_position_embeddings=self.max_position_embeddings,
-                type_vocab_size=self.type_vocab_size,
-                is_decoder=False,
-                initializer_range=self.initializer_range,
-            )
-
-            return (
-                config,
-                input_ids,
-                token_type_ids,
-                input_mask,
-                sequence_labels,
-                token_labels,
-                choice_labels,
-                fake_token_labels,
-            )
-
-        def check_loss_output(self, result):
-            self.parent.assertListEqual(list(result["loss"].size()), [])
-
-        def create_and_check_electra_model(
-            self,
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-            fake_token_labels,
-        ):
-            model = ElectraModel(config=config)
-            model.to(torch_device)
-            model.eval()
-            (sequence_output,) = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-            (sequence_output,) = model(input_ids, token_type_ids=token_type_ids)
-            (sequence_output,) = model(input_ids)
-
-            result = {
-                "sequence_output": sequence_output,
-            }
-            self.parent.assertListEqual(
-                list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
-            )
-
-        def create_and_check_electra_for_masked_lm(
-            self,
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-            fake_token_labels,
-        ):
-            model = ElectraForMaskedLM(config=config)
-            model.to(torch_device)
-            model.eval()
-            loss, prediction_scores = model(
-                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels
-            )
-            result = {
-                "loss": loss,
-                "prediction_scores": prediction_scores,
-            }
-            self.parent.assertListEqual(
-                list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
-            )
-            self.check_loss_output(result)
-
-        def create_and_check_electra_for_token_classification(
-            self,
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-            fake_token_labels,
-        ):
-            config.num_labels = self.num_labels
-            model = ElectraForTokenClassification(config=config)
-            model.to(torch_device)
-            model.eval()
-            loss, logits = model(
-                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels
-            )
-            result = {
-                "loss": loss,
-                "logits": logits,
-            }
-            self.parent.assertListEqual(
-                list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels]
-            )
-            self.check_loss_output(result)
-
-        def create_and_check_electra_for_pretraining(
-            self,
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-            fake_token_labels,
-        ):
-            config.num_labels = self.num_labels
-            model = ElectraForPreTraining(config=config)
-            model.to(torch_device)
-            model.eval()
-            loss, logits = model(
-                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=fake_token_labels
-            )
-            result = {
-                "loss": loss,
-                "logits": logits,
-            }
-            self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length])
-            self.check_loss_output(result)
-
-        def create_and_check_electra_for_sequence_classification(
-            self,
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-            fake_token_labels,
-        ):
-            config.num_labels = self.num_labels
-            model = ElectraForSequenceClassification(config)
-            model.to(torch_device)
-            model.eval()
-            loss, logits = model(
-                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels
-            )
-            result = {
-                "loss": loss,
-                "logits": logits,
-            }
-            self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
-            self.check_loss_output(result)
-
-        def create_and_check_electra_for_question_answering(
-            self,
-            config,
-            input_ids,
-            token_type_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-            choice_labels,
-            fake_token_labels,
-        ):
-            model = ElectraForQuestionAnswering(config=config)
-            model.to(torch_device)
-            model.eval()
-            loss, start_logits, end_logits = model(
-                input_ids,
-                attention_mask=input_mask,
-                token_type_ids=token_type_ids,
-                start_positions=sequence_labels,
-                end_positions=sequence_labels,
-            )
-            result = {
-                "loss": loss,
-                "start_logits": start_logits,
-                "end_logits": end_logits,
-            }
-            self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
-            self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
-            self.check_loss_output(result)
-
-        def prepare_config_and_inputs_for_common(self):
-            config_and_inputs = self.prepare_config_and_inputs()
-            (
-                config,
-                input_ids,
-                token_type_ids,
-                input_mask,
-                sequence_labels,
-                token_labels,
-                choice_labels,
-                fake_token_labels,
-            ) = config_and_inputs
-            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-            return config, inputs_dict
-
     def setUp(self):
-        self.model_tester = ElectraModelTest.ElectraModelTester(self)
+        self.model_tester = ElectraModelTester(self)
         self.config_tester = ConfigTester(self, config_class=ElectraConfig, hidden_size=37)
 
     def test_config(self):

@@ -35,6 +35,281 @@ if is_torch_available():
     from transformers.modeling_flaubert import FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST
 
 
+class FlaubertModelTester(object):
+    def __init__(
+        self, parent,
+    ):
+        self.parent = parent
+        self.batch_size = 13
+        self.seq_length = 7
+        self.is_training = True
+        self.use_input_lengths = True
+        self.use_token_type_ids = True
+        self.use_labels = True
+        self.gelu_activation = True
+        self.sinusoidal_embeddings = False
+        self.causal = False
+        self.asm = False
+        self.n_langs = 2
+        self.vocab_size = 99
+        self.n_special = 0
+        self.hidden_size = 32
+        self.num_hidden_layers = 5
+        self.num_attention_heads = 4
+        self.hidden_dropout_prob = 0.1
+        self.attention_probs_dropout_prob = 0.1
+        self.max_position_embeddings = 512
+        self.type_vocab_size = 12
+        self.type_sequence_label_size = 2
+        self.initializer_range = 0.02
+        self.num_labels = 3
+        self.num_choices = 4
+        self.summary_type = "last"
+        self.use_proj = None
+        self.scope = None
+
+    def prepare_config_and_inputs(self):
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+        input_mask = ids_tensor([self.batch_size, self.seq_length], 2).float()
+
+        input_lengths = None
+        if self.use_input_lengths:
+            input_lengths = (
+                ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2
+            )  # small variation of seq_length
+
+        token_type_ids = None
+        if self.use_token_type_ids:
+            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.n_langs)
+
+        sequence_labels = None
+        token_labels = None
+        is_impossible_labels = None
+        if self.use_labels:
+            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
+            is_impossible_labels = ids_tensor([self.batch_size], 2).float()
+
+        config = FlaubertConfig(
+            vocab_size=self.vocab_size,
+            n_special=self.n_special,
+            emb_dim=self.hidden_size,
+            n_layers=self.num_hidden_layers,
+            n_heads=self.num_attention_heads,
+            dropout=self.hidden_dropout_prob,
+            attention_dropout=self.attention_probs_dropout_prob,
+            gelu_activation=self.gelu_activation,
+            sinusoidal_embeddings=self.sinusoidal_embeddings,
+            asm=self.asm,
+            causal=self.causal,
+            n_langs=self.n_langs,
+            max_position_embeddings=self.max_position_embeddings,
+            initializer_range=self.initializer_range,
+            summary_type=self.summary_type,
+            use_proj=self.use_proj,
+        )
+
+        return (
+            config,
+            input_ids,
+            token_type_ids,
+            input_lengths,
+            sequence_labels,
+            token_labels,
+            is_impossible_labels,
+            input_mask,
+        )
+
+    def check_loss_output(self, result):
+        self.parent.assertListEqual(list(result["loss"].size()), [])
+
+    def create_and_check_flaubert_model(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_lengths,
+        sequence_labels,
+        token_labels,
+        is_impossible_labels,
+        input_mask,
+    ):
+        model = FlaubertModel(config=config)
+        model.to(torch_device)
+        model.eval()
+        outputs = model(input_ids, lengths=input_lengths, langs=token_type_ids)
+        outputs = model(input_ids, langs=token_type_ids)
+        outputs = model(input_ids)
+        sequence_output = outputs[0]
+        result = {
+            "sequence_output": sequence_output,
+        }
+        self.parent.assertListEqual(
+            list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
+        )
|
def create_and_check_flaubert_lm_head(
|
||||||
|
self,
|
||||||
|
config,
|
||||||
|
input_ids,
|
||||||
|
token_type_ids,
|
||||||
|
input_lengths,
|
||||||
|
sequence_labels,
|
||||||
|
token_labels,
|
||||||
|
is_impossible_labels,
|
||||||
|
input_mask,
|
||||||
|
):
|
||||||
|
model = FlaubertWithLMHeadModel(config)
|
||||||
|
model.to(torch_device)
|
||||||
|
model.eval()
|
||||||
|
|
||||||
|
loss, logits = model(input_ids, token_type_ids=token_type_ids, labels=token_labels)
|
||||||
|
|
||||||
|
result = {
|
||||||
|
"loss": loss,
|
||||||
|
"logits": logits,
|
||||||
|
}
|
||||||
|
|
||||||
|
self.parent.assertListEqual(list(result["loss"].size()), [])
|
||||||
|
self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size])
|
||||||
|
|
||||||
|
def create_and_check_flaubert_simple_qa(
|
||||||
|
self,
|
||||||
|
config,
|
||||||
|
input_ids,
|
||||||
|
token_type_ids,
|
||||||
|
input_lengths,
|
||||||
|
sequence_labels,
|
||||||
|
token_labels,
|
||||||
|
is_impossible_labels,
|
||||||
|
input_mask,
|
||||||
|
):
|
||||||
|
model = FlaubertForQuestionAnsweringSimple(config)
|
||||||
|
model.to(torch_device)
|
||||||
|
model.eval()
|
||||||
|
|
||||||
|
outputs = model(input_ids)
|
||||||
|
|
||||||
|
outputs = model(input_ids, start_positions=sequence_labels, end_positions=sequence_labels)
|
||||||
|
loss, start_logits, end_logits = outputs
|
||||||
|
|
||||||
|
result = {
|
||||||
|
"loss": loss,
|
||||||
|
"start_logits": start_logits,
|
||||||
|
"end_logits": end_logits,
|
||||||
|
}
|
||||||
|
self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
|
||||||
|
self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
|
||||||
|
self.check_loss_output(result)
|
||||||
|
|
||||||
|
def create_and_check_flaubert_qa(
|
||||||
|
self,
|
||||||
|
config,
|
||||||
|
input_ids,
|
||||||
|
token_type_ids,
|
||||||
|
input_lengths,
|
||||||
|
sequence_labels,
|
||||||
|
token_labels,
|
||||||
|
is_impossible_labels,
|
||||||
|
input_mask,
|
||||||
|
):
|
||||||
|
model = FlaubertForQuestionAnswering(config)
|
||||||
|
model.to(torch_device)
|
||||||
|
model.eval()
|
||||||
|
|
||||||
|
outputs = model(input_ids)
|
||||||
|
start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = outputs
|
||||||
|
|
||||||
|
outputs = model(
|
||||||
|
input_ids,
|
||||||
|
start_positions=sequence_labels,
|
||||||
|
end_positions=sequence_labels,
|
||||||
|
cls_index=sequence_labels,
|
||||||
|
is_impossible=is_impossible_labels,
|
||||||
|
p_mask=input_mask,
|
||||||
|
)
|
||||||
|
|
||||||
|
outputs = model(
|
||||||
|
input_ids,
|
||||||
|
start_positions=sequence_labels,
|
||||||
|
end_positions=sequence_labels,
|
||||||
|
cls_index=sequence_labels,
|
||||||
|
is_impossible=is_impossible_labels,
|
||||||
|
)
|
||||||
|
|
||||||
|
(total_loss,) = outputs
|
||||||
|
|
||||||
|
outputs = model(input_ids, start_positions=sequence_labels, end_positions=sequence_labels)
|
||||||
|
|
||||||
|
(total_loss,) = outputs
|
||||||
|
|
||||||
|
result = {
|
||||||
|
"loss": total_loss,
|
||||||
|
"start_top_log_probs": start_top_log_probs,
|
||||||
|
"start_top_index": start_top_index,
|
||||||
|
"end_top_log_probs": end_top_log_probs,
|
||||||
|
"end_top_index": end_top_index,
|
||||||
|
"cls_logits": cls_logits,
|
||||||
|
}
|
||||||
|
|
||||||
|
self.parent.assertListEqual(list(result["loss"].size()), [])
|
||||||
|
self.parent.assertListEqual(
|
||||||
|
list(result["start_top_log_probs"].size()), [self.batch_size, model.config.start_n_top]
|
||||||
|
)
|
||||||
|
self.parent.assertListEqual(
|
||||||
|
list(result["start_top_index"].size()), [self.batch_size, model.config.start_n_top]
|
||||||
|
)
|
||||||
|
self.parent.assertListEqual(
|
||||||
|
list(result["end_top_log_probs"].size()),
|
||||||
|
[self.batch_size, model.config.start_n_top * model.config.end_n_top],
|
||||||
|
)
|
||||||
|
self.parent.assertListEqual(
|
||||||
|
list(result["end_top_index"].size()), [self.batch_size, model.config.start_n_top * model.config.end_n_top],
|
||||||
|
)
|
||||||
|
self.parent.assertListEqual(list(result["cls_logits"].size()), [self.batch_size])
|
||||||
|
|
||||||
|
def create_and_check_flaubert_sequence_classif(
|
||||||
|
self,
|
||||||
|
config,
|
||||||
|
input_ids,
|
||||||
|
token_type_ids,
|
||||||
|
input_lengths,
|
||||||
|
sequence_labels,
|
||||||
|
token_labels,
|
||||||
|
is_impossible_labels,
|
||||||
|
input_mask,
|
||||||
|
):
|
||||||
|
model = FlaubertForSequenceClassification(config)
|
||||||
|
model.to(torch_device)
|
||||||
|
model.eval()
|
||||||
|
|
||||||
|
(logits,) = model(input_ids)
|
||||||
|
loss, logits = model(input_ids, labels=sequence_labels)
|
||||||
|
|
||||||
|
result = {
|
||||||
|
"loss": loss,
|
||||||
|
"logits": logits,
|
||||||
|
}
|
||||||
|
|
||||||
|
self.parent.assertListEqual(list(result["loss"].size()), [])
|
||||||
|
self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.type_sequence_label_size])
|
||||||
|
|
||||||
|
def prepare_config_and_inputs_for_common(self):
|
||||||
|
config_and_inputs = self.prepare_config_and_inputs()
|
||||||
|
(
|
||||||
|
config,
|
||||||
|
input_ids,
|
||||||
|
token_type_ids,
|
||||||
|
input_lengths,
|
||||||
|
sequence_labels,
|
||||||
|
token_labels,
|
||||||
|
is_impossible_labels,
|
||||||
|
input_mask,
|
||||||
|
) = config_and_inputs
|
||||||
|
inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "lengths": input_lengths}
|
||||||
|
return config, inputs_dict
|
||||||
|
|
||||||
|
|
||||||
@require_torch
|
@require_torch
|
||||||
class FlaubertModelTest(ModelTesterMixin, unittest.TestCase):
|
class FlaubertModelTest(ModelTesterMixin, unittest.TestCase):
|
||||||
|
|
||||||
|
@@ -50,316 +325,8 @@ class FlaubertModelTest(ModelTesterMixin, unittest.TestCase):
|
||||||
else ()
|
else ()
|
||||||
)
|
)
|
||||||
|
|
||||||
class FlaubertModelTester(object):
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
parent,
|
|
||||||
batch_size=13,
|
|
||||||
seq_length=7,
|
|
||||||
is_training=True,
|
|
||||||
use_input_lengths=True,
|
|
||||||
use_token_type_ids=True,
|
|
||||||
use_labels=True,
|
|
||||||
gelu_activation=True,
|
|
||||||
sinusoidal_embeddings=False,
|
|
||||||
causal=False,
|
|
||||||
asm=False,
|
|
||||||
n_langs=2,
|
|
||||||
vocab_size=99,
|
|
||||||
n_special=0,
|
|
||||||
hidden_size=32,
|
|
||||||
num_hidden_layers=5,
|
|
||||||
num_attention_heads=4,
|
|
||||||
hidden_dropout_prob=0.1,
|
|
||||||
attention_probs_dropout_prob=0.1,
|
|
||||||
max_position_embeddings=512,
|
|
||||||
type_vocab_size=16,
|
|
||||||
type_sequence_label_size=2,
|
|
||||||
initializer_range=0.02,
|
|
||||||
num_labels=3,
|
|
||||||
num_choices=4,
|
|
||||||
summary_type="last",
|
|
||||||
use_proj=True,
|
|
||||||
scope=None,
|
|
||||||
):
|
|
||||||
self.parent = parent
|
|
||||||
self.batch_size = batch_size
|
|
||||||
self.seq_length = seq_length
|
|
||||||
self.is_training = is_training
|
|
||||||
self.use_input_lengths = use_input_lengths
|
|
||||||
self.use_token_type_ids = use_token_type_ids
|
|
||||||
self.use_labels = use_labels
|
|
||||||
self.gelu_activation = gelu_activation
|
|
||||||
self.sinusoidal_embeddings = sinusoidal_embeddings
|
|
||||||
self.asm = asm
|
|
||||||
self.n_langs = n_langs
|
|
||||||
self.vocab_size = vocab_size
|
|
||||||
self.n_special = n_special
|
|
||||||
self.summary_type = summary_type
|
|
||||||
self.causal = causal
|
|
||||||
self.use_proj = use_proj
|
|
||||||
self.hidden_size = hidden_size
|
|
||||||
self.num_hidden_layers = num_hidden_layers
|
|
||||||
self.num_attention_heads = num_attention_heads
|
|
||||||
self.hidden_dropout_prob = hidden_dropout_prob
|
|
||||||
self.attention_probs_dropout_prob = attention_probs_dropout_prob
|
|
||||||
self.max_position_embeddings = max_position_embeddings
|
|
||||||
self.n_langs = n_langs
|
|
||||||
self.type_sequence_label_size = type_sequence_label_size
|
|
||||||
self.initializer_range = initializer_range
|
|
||||||
self.summary_type = summary_type
|
|
||||||
self.num_labels = num_labels
|
|
||||||
self.num_choices = num_choices
|
|
||||||
self.scope = scope
|
|
||||||
|
|
||||||
def prepare_config_and_inputs(self):
|
|
||||||
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
|
|
||||||
input_mask = ids_tensor([self.batch_size, self.seq_length], 2).float()
|
|
||||||
|
|
||||||
input_lengths = None
|
|
||||||
if self.use_input_lengths:
|
|
||||||
input_lengths = (
|
|
||||||
ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2
|
|
||||||
) # small variation of seq_length
|
|
||||||
|
|
||||||
token_type_ids = None
|
|
||||||
if self.use_token_type_ids:
|
|
||||||
token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.n_langs)
|
|
||||||
|
|
||||||
sequence_labels = None
|
|
||||||
token_labels = None
|
|
||||||
is_impossible_labels = None
|
|
||||||
if self.use_labels:
|
|
||||||
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
|
|
||||||
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
|
|
||||||
is_impossible_labels = ids_tensor([self.batch_size], 2).float()
|
|
||||||
|
|
||||||
config = FlaubertConfig(
|
|
||||||
vocab_size=self.vocab_size,
|
|
||||||
n_special=self.n_special,
|
|
||||||
emb_dim=self.hidden_size,
|
|
||||||
n_layers=self.num_hidden_layers,
|
|
||||||
n_heads=self.num_attention_heads,
|
|
||||||
dropout=self.hidden_dropout_prob,
|
|
||||||
attention_dropout=self.attention_probs_dropout_prob,
|
|
||||||
gelu_activation=self.gelu_activation,
|
|
||||||
sinusoidal_embeddings=self.sinusoidal_embeddings,
|
|
||||||
asm=self.asm,
|
|
||||||
causal=self.causal,
|
|
||||||
n_langs=self.n_langs,
|
|
||||||
max_position_embeddings=self.max_position_embeddings,
|
|
||||||
initializer_range=self.initializer_range,
|
|
||||||
summary_type=self.summary_type,
|
|
||||||
use_proj=self.use_proj,
|
|
||||||
)
|
|
||||||
|
|
||||||
return (
|
|
||||||
config,
|
|
||||||
input_ids,
|
|
||||||
token_type_ids,
|
|
||||||
input_lengths,
|
|
||||||
sequence_labels,
|
|
||||||
token_labels,
|
|
||||||
is_impossible_labels,
|
|
||||||
input_mask,
|
|
||||||
)
|
|
||||||
|
|
||||||
def check_loss_output(self, result):
|
|
||||||
self.parent.assertListEqual(list(result["loss"].size()), [])
|
|
||||||
|
|
||||||
def create_and_check_flaubert_model(
|
|
||||||
self,
|
|
||||||
config,
|
|
||||||
input_ids,
|
|
||||||
token_type_ids,
|
|
||||||
input_lengths,
|
|
||||||
sequence_labels,
|
|
||||||
token_labels,
|
|
||||||
is_impossible_labels,
|
|
||||||
input_mask,
|
|
||||||
):
|
|
||||||
model = FlaubertModel(config=config)
|
|
||||||
model.to(torch_device)
|
|
||||||
model.eval()
|
|
||||||
outputs = model(input_ids, lengths=input_lengths, langs=token_type_ids)
|
|
||||||
outputs = model(input_ids, langs=token_type_ids)
|
|
||||||
outputs = model(input_ids)
|
|
||||||
sequence_output = outputs[0]
|
|
||||||
result = {
|
|
||||||
"sequence_output": sequence_output,
|
|
||||||
}
|
|
||||||
self.parent.assertListEqual(
|
|
||||||
list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
|
|
||||||
)
|
|
||||||
|
|
||||||
def create_and_check_flaubert_lm_head(
|
|
||||||
self,
|
|
||||||
config,
|
|
||||||
input_ids,
|
|
||||||
token_type_ids,
|
|
||||||
input_lengths,
|
|
||||||
sequence_labels,
|
|
||||||
token_labels,
|
|
||||||
is_impossible_labels,
|
|
||||||
input_mask,
|
|
||||||
):
|
|
||||||
model = FlaubertWithLMHeadModel(config)
|
|
||||||
model.to(torch_device)
|
|
||||||
model.eval()
|
|
||||||
|
|
||||||
loss, logits = model(input_ids, token_type_ids=token_type_ids, labels=token_labels)
|
|
||||||
|
|
||||||
result = {
|
|
||||||
"loss": loss,
|
|
||||||
"logits": logits,
|
|
||||||
}
|
|
||||||
|
|
||||||
self.parent.assertListEqual(list(result["loss"].size()), [])
|
|
||||||
self.parent.assertListEqual(
|
|
||||||
list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size]
|
|
||||||
)
|
|
||||||
|
|
||||||
def create_and_check_flaubert_simple_qa(
|
|
||||||
self,
|
|
||||||
config,
|
|
||||||
input_ids,
|
|
||||||
token_type_ids,
|
|
||||||
input_lengths,
|
|
||||||
sequence_labels,
|
|
||||||
token_labels,
|
|
||||||
is_impossible_labels,
|
|
||||||
input_mask,
|
|
||||||
):
|
|
||||||
model = FlaubertForQuestionAnsweringSimple(config)
|
|
||||||
model.to(torch_device)
|
|
||||||
model.eval()
|
|
||||||
|
|
||||||
outputs = model(input_ids)
|
|
||||||
|
|
||||||
outputs = model(input_ids, start_positions=sequence_labels, end_positions=sequence_labels)
|
|
||||||
loss, start_logits, end_logits = outputs
|
|
||||||
|
|
||||||
result = {
|
|
||||||
"loss": loss,
|
|
||||||
"start_logits": start_logits,
|
|
||||||
"end_logits": end_logits,
|
|
||||||
}
|
|
||||||
self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
|
|
||||||
self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
|
|
||||||
self.check_loss_output(result)
|
|
||||||
|
|
||||||
def create_and_check_flaubert_qa(
|
|
||||||
self,
|
|
||||||
config,
|
|
||||||
input_ids,
|
|
||||||
token_type_ids,
|
|
||||||
input_lengths,
|
|
||||||
sequence_labels,
|
|
||||||
token_labels,
|
|
||||||
is_impossible_labels,
|
|
||||||
input_mask,
|
|
||||||
):
|
|
||||||
model = FlaubertForQuestionAnswering(config)
|
|
||||||
model.to(torch_device)
|
|
||||||
model.eval()
|
|
||||||
|
|
||||||
outputs = model(input_ids)
|
|
||||||
start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = outputs
|
|
||||||
|
|
||||||
outputs = model(
|
|
||||||
input_ids,
|
|
||||||
start_positions=sequence_labels,
|
|
||||||
end_positions=sequence_labels,
|
|
||||||
cls_index=sequence_labels,
|
|
||||||
is_impossible=is_impossible_labels,
|
|
||||||
p_mask=input_mask,
|
|
||||||
)
|
|
||||||
|
|
||||||
outputs = model(
|
|
||||||
input_ids,
|
|
||||||
start_positions=sequence_labels,
|
|
||||||
end_positions=sequence_labels,
|
|
||||||
cls_index=sequence_labels,
|
|
||||||
is_impossible=is_impossible_labels,
|
|
||||||
)
|
|
||||||
|
|
||||||
(total_loss,) = outputs
|
|
||||||
|
|
||||||
outputs = model(input_ids, start_positions=sequence_labels, end_positions=sequence_labels)
|
|
||||||
|
|
||||||
(total_loss,) = outputs
|
|
||||||
|
|
||||||
result = {
|
|
||||||
"loss": total_loss,
|
|
||||||
"start_top_log_probs": start_top_log_probs,
|
|
||||||
"start_top_index": start_top_index,
|
|
||||||
"end_top_log_probs": end_top_log_probs,
|
|
||||||
"end_top_index": end_top_index,
|
|
||||||
"cls_logits": cls_logits,
|
|
||||||
}
|
|
||||||
|
|
||||||
self.parent.assertListEqual(list(result["loss"].size()), [])
|
|
||||||
self.parent.assertListEqual(
|
|
||||||
list(result["start_top_log_probs"].size()), [self.batch_size, model.config.start_n_top]
|
|
||||||
)
|
|
||||||
self.parent.assertListEqual(
|
|
||||||
list(result["start_top_index"].size()), [self.batch_size, model.config.start_n_top]
|
|
||||||
)
|
|
||||||
self.parent.assertListEqual(
|
|
||||||
list(result["end_top_log_probs"].size()),
|
|
||||||
[self.batch_size, model.config.start_n_top * model.config.end_n_top],
|
|
||||||
)
|
|
||||||
self.parent.assertListEqual(
|
|
||||||
list(result["end_top_index"].size()),
|
|
||||||
[self.batch_size, model.config.start_n_top * model.config.end_n_top],
|
|
||||||
)
|
|
||||||
self.parent.assertListEqual(list(result["cls_logits"].size()), [self.batch_size])
|
|
||||||
|
|
||||||
def create_and_check_flaubert_sequence_classif(
|
|
||||||
self,
|
|
||||||
config,
|
|
||||||
input_ids,
|
|
||||||
token_type_ids,
|
|
||||||
input_lengths,
|
|
||||||
sequence_labels,
|
|
||||||
token_labels,
|
|
||||||
is_impossible_labels,
|
|
||||||
input_mask,
|
|
||||||
):
|
|
||||||
model = FlaubertForSequenceClassification(config)
|
|
||||||
model.to(torch_device)
|
|
||||||
model.eval()
|
|
||||||
|
|
||||||
(logits,) = model(input_ids)
|
|
||||||
loss, logits = model(input_ids, labels=sequence_labels)
|
|
||||||
|
|
||||||
result = {
|
|
||||||
"loss": loss,
|
|
||||||
"logits": logits,
|
|
||||||
}
|
|
||||||
|
|
||||||
self.parent.assertListEqual(list(result["loss"].size()), [])
|
|
||||||
self.parent.assertListEqual(
|
|
||||||
list(result["logits"].size()), [self.batch_size, self.type_sequence_label_size]
|
|
||||||
)
|
|
||||||
|
|
||||||
def prepare_config_and_inputs_for_common(self):
|
|
||||||
config_and_inputs = self.prepare_config_and_inputs()
|
|
||||||
(
|
|
||||||
config,
|
|
||||||
input_ids,
|
|
||||||
token_type_ids,
|
|
||||||
input_lengths,
|
|
||||||
sequence_labels,
|
|
||||||
token_labels,
|
|
||||||
is_impossible_labels,
|
|
||||||
input_mask,
|
|
||||||
) = config_and_inputs
|
|
||||||
inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "lengths": input_lengths}
|
|
||||||
return config, inputs_dict
|
|
||||||
|
|
||||||
     def setUp(self):
-        self.model_tester = FlaubertModelTest.FlaubertModelTester(self)
+        self.model_tester = FlaubertModelTester(self)
         self.config_tester = ConfigTester(self, config_class=FlaubertConfig, emb_dim=37)

     def test_config(self):
|
|
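Each tester's prepare_config_and_inputs_for_common exists so the shared checks in ModelTesterMixin can build models without knowing model-specific inputs. Roughly how such a consumer uses it, as a hedged, simplified sketch (run_common_forward_check and model_classes are hypothetical stand-ins, not helpers from this diff):

import torch


def run_common_forward_check(test_case, model_classes):
    # pull a small config plus a ready-to-use inputs dict from the hoisted tester
    config, inputs_dict = test_case.model_tester.prepare_config_and_inputs_for_common()
    for model_class in model_classes:
        model = model_class(config)
        model.eval()
        with torch.no_grad():
            outputs = model(**inputs_dict)
        # every model under test should at least produce an output tuple for the common inputs
        test_case.assertIsNotNone(outputs)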
@@ -34,6 +34,269 @@ if is_torch_available():
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class GPT2ModelTester:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
parent,
|
||||||
|
batch_size=14,
|
||||||
|
seq_length=7,
|
||||||
|
is_training=True,
|
||||||
|
use_token_type_ids=True,
|
||||||
|
use_input_mask=True,
|
||||||
|
use_labels=True,
|
||||||
|
use_mc_token_ids=True,
|
||||||
|
vocab_size=99,
|
||||||
|
hidden_size=32,
|
||||||
|
num_hidden_layers=5,
|
||||||
|
num_attention_heads=4,
|
||||||
|
intermediate_size=37,
|
||||||
|
hidden_act="gelu",
|
||||||
|
hidden_dropout_prob=0.1,
|
||||||
|
attention_probs_dropout_prob=0.1,
|
||||||
|
max_position_embeddings=512,
|
||||||
|
type_vocab_size=16,
|
||||||
|
type_sequence_label_size=2,
|
||||||
|
initializer_range=0.02,
|
||||||
|
num_labels=3,
|
||||||
|
num_choices=4,
|
||||||
|
scope=None,
|
||||||
|
):
|
||||||
|
self.parent = parent
|
||||||
|
self.batch_size = 14
|
||||||
|
self.seq_length = 7
|
||||||
|
self.is_training = True
|
||||||
|
self.use_token_type_ids = True
|
||||||
|
self.use_input_mask = True
|
||||||
|
self.use_labels = True
|
||||||
|
self.use_mc_token_ids = True
|
||||||
|
self.vocab_size = 99
|
||||||
|
self.hidden_size = 32
|
||||||
|
self.num_hidden_layers = 5
|
||||||
|
self.num_attention_heads = 4
|
||||||
|
self.intermediate_size = 37
|
||||||
|
self.hidden_act = "gelu"
|
||||||
|
self.hidden_dropout_prob = 0.1
|
||||||
|
self.attention_probs_dropout_prob = 0.1
|
||||||
|
self.max_position_embeddings = 512
|
||||||
|
self.type_vocab_size = 16
|
||||||
|
self.type_sequence_label_size = 2
|
||||||
|
self.initializer_range = 0.02
|
||||||
|
self.num_labels = 3
|
||||||
|
self.num_choices = 4
|
||||||
|
self.scope = None
|
||||||
|
self.bos_token_id = vocab_size - 1
|
||||||
|
self.eos_token_id = vocab_size - 1
|
||||||
|
|
||||||
|
def prepare_config_and_inputs(self):
|
||||||
|
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
|
||||||
|
|
||||||
|
input_mask = None
|
||||||
|
if self.use_input_mask:
|
||||||
|
input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
|
||||||
|
|
||||||
|
token_type_ids = None
|
||||||
|
if self.use_token_type_ids:
|
||||||
|
token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
|
||||||
|
|
||||||
|
mc_token_ids = None
|
||||||
|
if self.use_mc_token_ids:
|
||||||
|
mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length)
|
||||||
|
|
||||||
|
sequence_labels = None
|
||||||
|
token_labels = None
|
||||||
|
choice_labels = None
|
||||||
|
if self.use_labels:
|
||||||
|
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
|
||||||
|
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
|
||||||
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
|
config = GPT2Config(
|
||||||
|
vocab_size=self.vocab_size,
|
||||||
|
n_embd=self.hidden_size,
|
||||||
|
n_layer=self.num_hidden_layers,
|
||||||
|
n_head=self.num_attention_heads,
|
||||||
|
# intermediate_size=self.intermediate_size,
|
||||||
|
# hidden_act=self.hidden_act,
|
||||||
|
# hidden_dropout_prob=self.hidden_dropout_prob,
|
||||||
|
# attention_probs_dropout_prob=self.attention_probs_dropout_prob,
|
||||||
|
n_positions=self.max_position_embeddings,
|
||||||
|
n_ctx=self.max_position_embeddings,
|
||||||
|
# type_vocab_size=self.type_vocab_size,
|
||||||
|
# initializer_range=self.initializer_range
|
||||||
|
bos_token_id=self.bos_token_id,
|
||||||
|
eos_token_id=self.eos_token_id,
|
||||||
|
)
|
||||||
|
|
||||||
|
head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
|
||||||
|
|
||||||
|
return (
|
||||||
|
config,
|
||||||
|
input_ids,
|
||||||
|
input_mask,
|
||||||
|
head_mask,
|
||||||
|
token_type_ids,
|
||||||
|
mc_token_ids,
|
||||||
|
sequence_labels,
|
||||||
|
token_labels,
|
||||||
|
choice_labels,
|
||||||
|
)
|
||||||
|
|
||||||
|
def check_loss_output(self, result):
|
||||||
|
self.parent.assertListEqual(list(result["loss"].size()), [])
|
||||||
|
|
||||||
|
def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
|
||||||
|
model = GPT2Model(config=config)
|
||||||
|
model.to(torch_device)
|
||||||
|
model.eval()
|
||||||
|
|
||||||
|
model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
|
||||||
|
model(input_ids, token_type_ids=token_type_ids)
|
||||||
|
sequence_output, presents = model(input_ids)
|
||||||
|
|
||||||
|
result = {
|
||||||
|
"sequence_output": sequence_output,
|
||||||
|
"presents": presents,
|
||||||
|
}
|
||||||
|
self.parent.assertListEqual(
|
||||||
|
list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size],
|
||||||
|
)
|
||||||
|
self.parent.assertEqual(len(result["presents"]), config.n_layer)
|
||||||
|
|
||||||
|
def create_and_check_gpt2_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
|
||||||
|
model = GPT2Model(config=config)
|
||||||
|
model.to(torch_device)
|
||||||
|
model.eval()
|
||||||
|
|
||||||
|
# first forward pass
|
||||||
|
output, past = model(input_ids, token_type_ids=token_type_ids)
|
||||||
|
|
||||||
|
# create hypothetical next token and extend to next_input_ids
|
||||||
|
next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
|
||||||
|
next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size)
|
||||||
|
|
||||||
|
# append to next input_ids and token_type_ids
|
||||||
|
next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
|
||||||
|
next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1)
|
||||||
|
|
||||||
|
output_from_no_past, _ = model(next_input_ids, token_type_ids=next_token_type_ids)
|
||||||
|
output_from_past, _ = model(next_tokens, token_type_ids=next_token_types, past=past)
|
||||||
|
|
||||||
|
# select random slice
|
||||||
|
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
|
||||||
|
output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
|
||||||
|
output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
|
||||||
|
|
||||||
|
# test that outputs are equal for slice
|
||||||
|
self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
|
||||||
|
|
||||||
|
def create_and_check_gpt2_model_attention_mask_past(
|
||||||
|
self, config, input_ids, input_mask, head_mask, token_type_ids, *args
|
||||||
|
):
|
||||||
|
model = GPT2Model(config=config)
|
||||||
|
model.to(torch_device)
|
||||||
|
model.eval()
|
||||||
|
|
||||||
|
# create attention mask
|
||||||
|
attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
|
||||||
|
half_seq_length = self.seq_length // 2
|
||||||
|
attn_mask[:, half_seq_length:] = 0
|
||||||
|
|
||||||
|
# first forward pass
|
||||||
|
output, past = model(input_ids, attention_mask=attn_mask)
|
||||||
|
|
||||||
|
# create hypothetical next token and extend to next_input_ids
|
||||||
|
next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
|
||||||
|
|
||||||
|
# change a random masked slice from input_ids
|
||||||
|
random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1
|
||||||
|
random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1)
|
||||||
|
input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens
|
||||||
|
|
||||||
|
# append to next input_ids and attn_mask
|
||||||
|
next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
|
||||||
|
attn_mask = torch.cat(
|
||||||
|
[attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)], dim=1,
|
||||||
|
)
|
||||||
|
|
||||||
|
# get two different outputs
|
||||||
|
output_from_no_past, _ = model(next_input_ids, attention_mask=attn_mask)
|
||||||
|
output_from_past, _ = model(next_tokens, past=past, attention_mask=attn_mask)
|
||||||
|
|
||||||
|
# select random slice
|
||||||
|
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
|
||||||
|
output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
|
||||||
|
output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
|
||||||
|
|
||||||
|
# test that outputs are equal for slice
|
||||||
|
self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
|
||||||
|
|
||||||
|
def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
|
||||||
|
model = GPT2LMHeadModel(config)
|
||||||
|
model.to(torch_device)
|
||||||
|
model.eval()
|
||||||
|
|
||||||
|
loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
|
||||||
|
|
||||||
|
result = {"loss": loss, "lm_logits": lm_logits}
|
||||||
|
|
||||||
|
self.parent.assertListEqual(list(result["loss"].size()), [])
|
||||||
|
self.parent.assertListEqual(
|
||||||
|
list(result["lm_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size],
|
||||||
|
)
|
||||||
|
|
||||||
|
def create_and_check_double_lm_head_model(
|
||||||
|
self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args
|
||||||
|
):
|
||||||
|
model = GPT2DoubleHeadsModel(config)
|
||||||
|
model.to(torch_device)
|
||||||
|
model.eval()
|
||||||
|
|
||||||
|
multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
|
||||||
|
multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
|
||||||
|
multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
|
||||||
|
|
||||||
|
inputs = {
|
||||||
|
"input_ids": multiple_choice_inputs_ids,
|
||||||
|
"mc_token_ids": mc_token_ids,
|
||||||
|
"attention_mask": multiple_choice_input_mask,
|
||||||
|
"token_type_ids": multiple_choice_token_type_ids,
|
||||||
|
"labels": multiple_choice_inputs_ids,
|
||||||
|
}
|
||||||
|
|
||||||
|
loss, lm_logits, mc_logits, _ = model(**inputs)
|
||||||
|
|
||||||
|
result = {"loss": loss, "lm_logits": lm_logits, "mc_logits": mc_logits}
|
||||||
|
|
||||||
|
self.parent.assertListEqual(list(result["loss"].size()), [])
|
||||||
|
self.parent.assertListEqual(
|
||||||
|
list(result["lm_logits"].size()), [self.batch_size, self.num_choices, self.seq_length, self.vocab_size],
|
||||||
|
)
|
||||||
|
self.parent.assertListEqual(list(result["mc_logits"].size()), [self.batch_size, self.num_choices])
|
||||||
|
|
||||||
|
def prepare_config_and_inputs_for_common(self):
|
||||||
|
config_and_inputs = self.prepare_config_and_inputs()
|
||||||
|
|
||||||
|
(
|
||||||
|
config,
|
||||||
|
input_ids,
|
||||||
|
input_mask,
|
||||||
|
head_mask,
|
||||||
|
token_type_ids,
|
||||||
|
mc_token_ids,
|
||||||
|
sequence_labels,
|
||||||
|
token_labels,
|
||||||
|
choice_labels,
|
||||||
|
) = config_and_inputs
|
||||||
|
|
||||||
|
inputs_dict = {
|
||||||
|
"input_ids": input_ids,
|
||||||
|
"token_type_ids": token_type_ids,
|
||||||
|
"head_mask": head_mask,
|
||||||
|
}
|
||||||
|
|
||||||
|
return config, inputs_dict
|
||||||
|
|
||||||
|
|
||||||
@require_torch
|
@require_torch
|
||||||
class GPT2ModelTest(ModelTesterMixin, unittest.TestCase):
|
class GPT2ModelTest(ModelTesterMixin, unittest.TestCase):
|
||||||
|
|
||||||
|
@@ -42,271 +305,8 @@ class GPT2ModelTest(ModelTesterMixin, unittest.TestCase):
|
||||||
(GPT2LMHeadModel,) if is_torch_available() else ()
|
(GPT2LMHeadModel,) if is_torch_available() else ()
|
||||||
) # TODO (PVP): Add Double HeadsModel when generate() function is changed accordingly
|
) # TODO (PVP): Add Double HeadsModel when generate() function is changed accordingly
|
||||||
|
|
||||||
class GPT2ModelTester(object):
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
parent,
|
|
||||||
batch_size=14,
|
|
||||||
seq_length=7,
|
|
||||||
is_training=True,
|
|
||||||
use_token_type_ids=True,
|
|
||||||
use_input_mask=True,
|
|
||||||
use_labels=True,
|
|
||||||
use_mc_token_ids=True,
|
|
||||||
vocab_size=99,
|
|
||||||
hidden_size=32,
|
|
||||||
num_hidden_layers=5,
|
|
||||||
num_attention_heads=4,
|
|
||||||
intermediate_size=37,
|
|
||||||
hidden_act="gelu",
|
|
||||||
hidden_dropout_prob=0.1,
|
|
||||||
attention_probs_dropout_prob=0.1,
|
|
||||||
max_position_embeddings=512,
|
|
||||||
type_vocab_size=16,
|
|
||||||
type_sequence_label_size=2,
|
|
||||||
initializer_range=0.02,
|
|
||||||
num_labels=3,
|
|
||||||
num_choices=4,
|
|
||||||
scope=None,
|
|
||||||
):
|
|
||||||
self.parent = parent
|
|
||||||
self.batch_size = batch_size
|
|
||||||
self.seq_length = seq_length
|
|
||||||
self.is_training = is_training
|
|
||||||
self.use_token_type_ids = use_token_type_ids
|
|
||||||
self.use_input_mask = use_input_mask
|
|
||||||
self.use_labels = use_labels
|
|
||||||
self.use_mc_token_ids = use_mc_token_ids
|
|
||||||
self.vocab_size = vocab_size
|
|
||||||
self.hidden_size = hidden_size
|
|
||||||
self.num_hidden_layers = num_hidden_layers
|
|
||||||
self.num_attention_heads = num_attention_heads
|
|
||||||
self.intermediate_size = intermediate_size
|
|
||||||
self.hidden_act = hidden_act
|
|
||||||
self.hidden_dropout_prob = hidden_dropout_prob
|
|
||||||
self.attention_probs_dropout_prob = attention_probs_dropout_prob
|
|
||||||
self.max_position_embeddings = max_position_embeddings
|
|
||||||
self.type_vocab_size = type_vocab_size
|
|
||||||
self.type_sequence_label_size = type_sequence_label_size
|
|
||||||
self.initializer_range = initializer_range
|
|
||||||
self.num_labels = num_labels
|
|
||||||
self.num_choices = num_choices
|
|
||||||
self.scope = scope
|
|
||||||
self.bos_token_id = vocab_size - 1
|
|
||||||
self.eos_token_id = vocab_size - 1
|
|
||||||
|
|
||||||
def prepare_config_and_inputs(self):
|
|
||||||
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
|
|
||||||
|
|
||||||
input_mask = None
|
|
||||||
if self.use_input_mask:
|
|
||||||
input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
|
|
||||||
|
|
||||||
token_type_ids = None
|
|
||||||
if self.use_token_type_ids:
|
|
||||||
token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
|
|
||||||
|
|
||||||
mc_token_ids = None
|
|
||||||
if self.use_mc_token_ids:
|
|
||||||
mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length)
|
|
||||||
|
|
||||||
sequence_labels = None
|
|
||||||
token_labels = None
|
|
||||||
choice_labels = None
|
|
||||||
if self.use_labels:
|
|
||||||
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
|
|
||||||
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
|
|
||||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
|
||||||
|
|
||||||
config = GPT2Config(
|
|
||||||
vocab_size=self.vocab_size,
|
|
||||||
n_embd=self.hidden_size,
|
|
||||||
n_layer=self.num_hidden_layers,
|
|
||||||
n_head=self.num_attention_heads,
|
|
||||||
# intermediate_size=self.intermediate_size,
|
|
||||||
# hidden_act=self.hidden_act,
|
|
||||||
# hidden_dropout_prob=self.hidden_dropout_prob,
|
|
||||||
# attention_probs_dropout_prob=self.attention_probs_dropout_prob,
|
|
||||||
n_positions=self.max_position_embeddings,
|
|
||||||
n_ctx=self.max_position_embeddings,
|
|
||||||
# type_vocab_size=self.type_vocab_size,
|
|
||||||
# initializer_range=self.initializer_range
|
|
||||||
bos_token_id=self.bos_token_id,
|
|
||||||
eos_token_id=self.eos_token_id,
|
|
||||||
)
|
|
||||||
|
|
||||||
head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
|
|
||||||
|
|
||||||
return (
|
|
||||||
config,
|
|
||||||
input_ids,
|
|
||||||
input_mask,
|
|
||||||
head_mask,
|
|
||||||
token_type_ids,
|
|
||||||
mc_token_ids,
|
|
||||||
sequence_labels,
|
|
||||||
token_labels,
|
|
||||||
choice_labels,
|
|
||||||
)
|
|
||||||
|
|
||||||
def check_loss_output(self, result):
|
|
||||||
self.parent.assertListEqual(list(result["loss"].size()), [])
|
|
||||||
|
|
||||||
def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
|
|
||||||
model = GPT2Model(config=config)
|
|
||||||
model.to(torch_device)
|
|
||||||
model.eval()
|
|
||||||
|
|
||||||
model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
|
|
||||||
model(input_ids, token_type_ids=token_type_ids)
|
|
||||||
sequence_output, presents = model(input_ids)
|
|
||||||
|
|
||||||
result = {
|
|
||||||
"sequence_output": sequence_output,
|
|
||||||
"presents": presents,
|
|
||||||
}
|
|
||||||
self.parent.assertListEqual(
|
|
||||||
list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size],
|
|
||||||
)
|
|
||||||
self.parent.assertEqual(len(result["presents"]), config.n_layer)
|
|
||||||
|
|
||||||
def create_and_check_gpt2_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
|
|
||||||
model = GPT2Model(config=config)
|
|
||||||
model.to(torch_device)
|
|
||||||
model.eval()
|
|
||||||
|
|
||||||
# first forward pass
|
|
||||||
output, past = model(input_ids, token_type_ids=token_type_ids)
|
|
||||||
|
|
||||||
# create hypothetical next token and extend to next_input_ids
|
|
||||||
next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
|
|
||||||
next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size)
|
|
||||||
|
|
||||||
# append to next input_ids and token_type_ids
|
|
||||||
next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
|
|
||||||
next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1)
|
|
||||||
|
|
||||||
output_from_no_past, _ = model(next_input_ids, token_type_ids=next_token_type_ids)
|
|
||||||
output_from_past, _ = model(next_tokens, token_type_ids=next_token_types, past=past)
|
|
||||||
|
|
||||||
# select random slice
|
|
||||||
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
|
|
||||||
output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
|
|
||||||
output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
|
|
||||||
|
|
||||||
# test that outputs are equal for slice
|
|
||||||
self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
|
|
||||||
|
|
||||||
def create_and_check_gpt2_model_attention_mask_past(
|
|
||||||
self, config, input_ids, input_mask, head_mask, token_type_ids, *args
|
|
||||||
):
|
|
||||||
model = GPT2Model(config=config)
|
|
||||||
model.to(torch_device)
|
|
||||||
model.eval()
|
|
||||||
|
|
||||||
# create attention mask
|
|
||||||
attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
|
|
||||||
half_seq_length = self.seq_length // 2
|
|
||||||
attn_mask[:, half_seq_length:] = 0
|
|
||||||
|
|
||||||
# first forward pass
|
|
||||||
output, past = model(input_ids, attention_mask=attn_mask)
|
|
||||||
|
|
||||||
# create hypothetical next token and extend to next_input_ids
|
|
||||||
next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
|
|
||||||
|
|
||||||
# change a random masked slice from input_ids
|
|
||||||
random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1
|
|
||||||
random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1)
|
|
||||||
input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens
|
|
||||||
|
|
||||||
# append to next input_ids and attn_mask
|
|
||||||
next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
|
|
||||||
attn_mask = torch.cat(
|
|
||||||
[attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)], dim=1,
|
|
||||||
)
|
|
||||||
|
|
||||||
# get two different outputs
|
|
||||||
output_from_no_past, _ = model(next_input_ids, attention_mask=attn_mask)
|
|
||||||
output_from_past, _ = model(next_tokens, past=past, attention_mask=attn_mask)
|
|
||||||
|
|
||||||
# select random slice
|
|
||||||
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
|
|
||||||
output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
|
|
||||||
output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
|
|
||||||
|
|
||||||
# test that outputs are equal for slice
|
|
||||||
self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
|
|
||||||
|
|
||||||
def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
|
|
||||||
model = GPT2LMHeadModel(config)
|
|
||||||
model.to(torch_device)
|
|
||||||
model.eval()
|
|
||||||
|
|
||||||
loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
|
|
||||||
|
|
||||||
result = {"loss": loss, "lm_logits": lm_logits}
|
|
||||||
|
|
||||||
self.parent.assertListEqual(list(result["loss"].size()), [])
|
|
||||||
self.parent.assertListEqual(
|
|
||||||
list(result["lm_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size],
|
|
||||||
)
|
|
||||||
|
|
||||||
def create_and_check_double_lm_head_model(
|
|
||||||
self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args
|
|
||||||
):
|
|
||||||
model = GPT2DoubleHeadsModel(config)
|
|
||||||
model.to(torch_device)
|
|
||||||
model.eval()
|
|
||||||
|
|
||||||
multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
|
|
||||||
multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
|
|
||||||
multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
|
|
||||||
|
|
||||||
inputs = {
|
|
||||||
"input_ids": multiple_choice_inputs_ids,
|
|
||||||
"mc_token_ids": mc_token_ids,
|
|
||||||
"attention_mask": multiple_choice_input_mask,
|
|
||||||
"token_type_ids": multiple_choice_token_type_ids,
|
|
||||||
"labels": multiple_choice_inputs_ids,
|
|
||||||
}
|
|
||||||
|
|
||||||
loss, lm_logits, mc_logits, _ = model(**inputs)
|
|
||||||
|
|
||||||
result = {"loss": loss, "lm_logits": lm_logits, "mc_logits": mc_logits}
|
|
||||||
|
|
||||||
self.parent.assertListEqual(list(result["loss"].size()), [])
|
|
||||||
self.parent.assertListEqual(
|
|
||||||
list(result["lm_logits"].size()),
|
|
||||||
[self.batch_size, self.num_choices, self.seq_length, self.vocab_size],
|
|
||||||
)
|
|
||||||
self.parent.assertListEqual(list(result["mc_logits"].size()), [self.batch_size, self.num_choices])
|
|
||||||
|
|
||||||
def prepare_config_and_inputs_for_common(self):
|
|
||||||
config_and_inputs = self.prepare_config_and_inputs()
|
|
||||||
|
|
||||||
(
|
|
||||||
config,
|
|
||||||
input_ids,
|
|
||||||
input_mask,
|
|
||||||
head_mask,
|
|
||||||
token_type_ids,
|
|
||||||
mc_token_ids,
|
|
||||||
sequence_labels,
|
|
||||||
token_labels,
|
|
||||||
choice_labels,
|
|
||||||
) = config_and_inputs
|
|
||||||
|
|
||||||
inputs_dict = {
|
|
||||||
"input_ids": input_ids,
|
|
||||||
"token_type_ids": token_type_ids,
|
|
||||||
"head_mask": head_mask,
|
|
||||||
}
|
|
||||||
|
|
||||||
return config, inputs_dict
|
|
||||||
|
|
||||||
     def setUp(self):
-        self.model_tester = GPT2ModelTest.GPT2ModelTester(self)
+        self.model_tester = GPT2ModelTester(self)
         self.config_tester = ConfigTester(self, config_class=GPT2Config, n_embd=37)

     def test_config(self):
|
|
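For reference, the past caching behaviour that create_and_check_gpt2_model_past exercises can be sketched outside the test harness. A minimal sketch, assuming the transformers API of this commit's era (GPT2Model returns (hidden_states, presents) and accepts past=); config values are borrowed from the tester above:

import torch
from transformers import GPT2Config, GPT2Model

config = GPT2Config(vocab_size=99, n_embd=32, n_layer=5, n_head=4, n_positions=512, n_ctx=512)
model = GPT2Model(config)
model.eval()

input_ids = torch.randint(0, config.vocab_size, (1, 7))
with torch.no_grad():
    # full forward pass; presents carries the cached key/value states per layer
    hidden_states, presents = model(input_ids)
    # feed only the new token and reuse the cache instead of re-encoding the prefix
    next_token = torch.randint(0, config.vocab_size, (1, 1))
    hidden_step, _ = model(next_token, past=presents)

print(hidden_states.shape, hidden_step.shape)  # torch.Size([1, 7, 32]) torch.Size([1, 1, 32])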
@@ -36,56 +36,33 @@ if is_torch_available():
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
-class LongformerModelTester(object):
+class LongformerModelTester:
     def __init__(
-        self,
-        parent,
-        batch_size=13,
-        seq_length=7,
-        is_training=True,
-        use_input_mask=True,
-        use_token_type_ids=True,
-        use_labels=True,
-        vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=5,
-        num_attention_heads=4,
-        intermediate_size=37,
-        hidden_act="gelu",
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=512,
-        type_vocab_size=16,
-        type_sequence_label_size=2,
-        initializer_range=0.02,
-        num_labels=3,
-        num_choices=4,
-        scope=None,
-        attention_window=4,
+        self, parent,
     ):
         self.parent = parent
-        self.batch_size = batch_size
+        self.batch_size = 13
-        self.seq_length = seq_length
+        self.seq_length = 7
-        self.is_training = is_training
+        self.is_training = True
-        self.use_input_mask = use_input_mask
+        self.use_input_mask = True
-        self.use_token_type_ids = use_token_type_ids
+        self.use_token_type_ids = True
-        self.use_labels = use_labels
+        self.use_labels = True
-        self.vocab_size = vocab_size
+        self.vocab_size = 99
-        self.hidden_size = hidden_size
+        self.hidden_size = 32
-        self.num_hidden_layers = num_hidden_layers
+        self.num_hidden_layers = 5
-        self.num_attention_heads = num_attention_heads
+        self.num_attention_heads = 4
-        self.intermediate_size = intermediate_size
+        self.intermediate_size = 37
-        self.hidden_act = hidden_act
+        self.hidden_act = "gelu"
-        self.hidden_dropout_prob = hidden_dropout_prob
+        self.hidden_dropout_prob = 0.1
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.attention_probs_dropout_prob = 0.1
-        self.max_position_embeddings = max_position_embeddings
+        self.max_position_embeddings = 512
-        self.type_vocab_size = type_vocab_size
+        self.type_vocab_size = 16
-        self.type_sequence_label_size = type_sequence_label_size
+        self.type_sequence_label_size = 2
-        self.initializer_range = initializer_range
+        self.initializer_range = 0.02
-        self.num_labels = num_labels
+        self.num_labels = 3
-        self.num_choices = num_choices
+        self.num_choices = 4
-        self.scope = scope
+        self.scope = None
-        self.attention_window = attention_window
+        self.attention_window = 4

         # `ModelTesterMixin.test_attention_outputs` is expecting attention tensors to be of size
         # [num_attention_heads, encoder_seq_length, encoder_key_length], but LongformerSelfAttention
|
@@ -34,6 +34,139 @@ if is_torch_available():
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class OpenAIGPTModelTester:
|
||||||
|
def __init__(
|
||||||
|
self, parent,
|
||||||
|
):
|
||||||
|
self.parent = parent
|
||||||
|
self.batch_size = 13
|
||||||
|
self.seq_length = 7
|
||||||
|
self.is_training = True
|
||||||
|
self.use_token_type_ids = True
|
||||||
|
self.use_labels = True
|
||||||
|
self.vocab_size = 99
|
||||||
|
self.hidden_size = 32
|
||||||
|
self.num_hidden_layers = 5
|
||||||
|
self.num_attention_heads = 4
|
||||||
|
self.intermediate_size = 37
|
||||||
|
self.hidden_act = "gelu"
|
||||||
|
self.hidden_dropout_prob = 0.1
|
||||||
|
self.attention_probs_dropout_prob = 0.1
|
||||||
|
self.max_position_embeddings = 512
|
||||||
|
self.type_vocab_size = 16
|
||||||
|
self.type_sequence_label_size = 2
|
||||||
|
self.initializer_range = 0.02
|
||||||
|
self.num_labels = 3
|
||||||
|
self.num_choices = 4
|
||||||
|
self.scope = None
|
||||||
|
|
||||||
|
def prepare_config_and_inputs(self):
|
||||||
|
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
|
||||||
|
|
||||||
|
token_type_ids = None
|
||||||
|
if self.use_token_type_ids:
|
||||||
|
token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
|
||||||
|
|
||||||
|
sequence_labels = None
|
||||||
|
token_labels = None
|
||||||
|
choice_labels = None
|
||||||
|
if self.use_labels:
|
||||||
|
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
|
||||||
|
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
|
||||||
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
|
config = OpenAIGPTConfig(
|
||||||
|
vocab_size=self.vocab_size,
|
||||||
|
n_embd=self.hidden_size,
|
||||||
|
n_layer=self.num_hidden_layers,
|
||||||
|
n_head=self.num_attention_heads,
|
||||||
|
# intermediate_size=self.intermediate_size,
|
||||||
|
# hidden_act=self.hidden_act,
|
||||||
|
# hidden_dropout_prob=self.hidden_dropout_prob,
|
||||||
|
# attention_probs_dropout_prob=self.attention_probs_dropout_prob,
|
||||||
|
n_positions=self.max_position_embeddings,
|
||||||
|
n_ctx=self.max_position_embeddings
|
||||||
|
# type_vocab_size=self.type_vocab_size,
|
||||||
|
# initializer_range=self.initializer_range
|
||||||
|
)
|
||||||
|
|
||||||
|
head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
|
||||||
|
|
||||||
|
return (
|
||||||
|
config,
|
||||||
|
input_ids,
|
||||||
|
head_mask,
|
||||||
|
token_type_ids,
|
||||||
|
sequence_labels,
|
||||||
|
token_labels,
|
||||||
|
choice_labels,
|
||||||
|
)
|
||||||
|
|
||||||
|
def check_loss_output(self, result):
|
||||||
|
self.parent.assertListEqual(list(result["loss"].size()), [])
|
||||||
|
|
||||||
|
def create_and_check_openai_gpt_model(self, config, input_ids, head_mask, token_type_ids, *args):
|
||||||
|
model = OpenAIGPTModel(config=config)
|
||||||
|
model.to(torch_device)
|
||||||
|
model.eval()
|
||||||
|
|
||||||
|
model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
|
||||||
|
model(input_ids, token_type_ids=token_type_ids)
|
||||||
|
(sequence_output,) = model(input_ids)
|
||||||
|
|
||||||
|
result = {"sequence_output": sequence_output}
|
||||||
|
self.parent.assertListEqual(
|
||||||
|
list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size],
|
||||||
|
)
|
||||||
|
|
||||||
|
def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
|
||||||
|
model = OpenAIGPTLMHeadModel(config)
|
||||||
|
model.to(torch_device)
|
||||||
|
model.eval()
|
||||||
|
|
||||||
|
loss, lm_logits = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
|
||||||
|
|
||||||
|
result = {"loss": loss, "lm_logits": lm_logits}
|
||||||
|
|
||||||
|
self.parent.assertListEqual(list(result["loss"].size()), [])
|
||||||
|
self.parent.assertListEqual(
|
||||||
|
list(result["lm_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size],
|
||||||
|
)
|
||||||
|
|
||||||
|
def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
|
||||||
|
model = OpenAIGPTDoubleHeadsModel(config)
|
||||||
|
model.to(torch_device)
|
||||||
|
model.eval()
|
||||||
|
|
||||||
|
loss, lm_logits, mc_logits = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
|
||||||
|
|
||||||
|
result = {"loss": loss, "lm_logits": lm_logits}
|
||||||
|
|
||||||
|
self.parent.assertListEqual(list(result["loss"].size()), [])
|
||||||
|
self.parent.assertListEqual(
|
||||||
|
list(result["lm_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size],
|
||||||
|
)
|
||||||
|
|
||||||
|
def prepare_config_and_inputs_for_common(self):
|
||||||
|
config_and_inputs = self.prepare_config_and_inputs()
|
||||||
|
(
|
||||||
|
config,
|
||||||
|
input_ids,
|
||||||
|
head_mask,
|
||||||
|
token_type_ids,
|
||||||
|
sequence_labels,
|
||||||
|
token_labels,
|
||||||
|
choice_labels,
|
||||||
|
) = config_and_inputs
|
||||||
|
inputs_dict = {
|
||||||
|
"input_ids": input_ids,
|
||||||
|
"token_type_ids": token_type_ids,
|
||||||
|
"head_mask": head_mask,
|
||||||
|
}
|
||||||
|
|
||||||
|
return config, inputs_dict
|
||||||
|
|
||||||
|
|
||||||
@require_torch
|
@require_torch
|
||||||
class OpenAIGPTModelTest(ModelTesterMixin, unittest.TestCase):
|
class OpenAIGPTModelTest(ModelTesterMixin, unittest.TestCase):
|
||||||
|
|
||||||
|
@@ -44,161 +177,8 @@ class OpenAIGPTModelTest(ModelTesterMixin, unittest.TestCase):
|
||||||
(OpenAIGPTLMHeadModel,) if is_torch_available() else ()
|
(OpenAIGPTLMHeadModel,) if is_torch_available() else ()
|
||||||
) # TODO (PVP): Add Double HeadsModel when generate() function is changed accordingly
|
) # TODO (PVP): Add Double HeadsModel when generate() function is changed accordingly
|
||||||
|
|
||||||
-    class OpenAIGPTModelTester(object):
-        def __init__(
-            self,
-            parent,
-            batch_size=13,
-            seq_length=7,
-            is_training=True,
-            use_token_type_ids=True,
-            use_labels=True,
-            vocab_size=99,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            intermediate_size=37,
-            hidden_act="gelu",
-            hidden_dropout_prob=0.1,
-            attention_probs_dropout_prob=0.1,
-            max_position_embeddings=512,
-            type_vocab_size=16,
-            type_sequence_label_size=2,
-            initializer_range=0.02,
-            num_labels=3,
-            num_choices=4,
-            scope=None,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.is_training = is_training
-            self.use_token_type_ids = use_token_type_ids
-            self.use_labels = use_labels
-            self.vocab_size = vocab_size
-            self.hidden_size = hidden_size
-            self.num_hidden_layers = num_hidden_layers
-            self.num_attention_heads = num_attention_heads
-            self.intermediate_size = intermediate_size
-            self.hidden_act = hidden_act
-            self.hidden_dropout_prob = hidden_dropout_prob
-            self.attention_probs_dropout_prob = attention_probs_dropout_prob
-            self.max_position_embeddings = max_position_embeddings
-            self.type_vocab_size = type_vocab_size
-            self.type_sequence_label_size = type_sequence_label_size
-            self.initializer_range = initializer_range
-            self.num_labels = num_labels
-            self.num_choices = num_choices
-            self.scope = scope
-
-        def prepare_config_and_inputs(self):
-            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-            token_type_ids = None
-            if self.use_token_type_ids:
-                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-            sequence_labels = None
-            token_labels = None
-            choice_labels = None
-            if self.use_labels:
-                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-                choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-            config = OpenAIGPTConfig(
-                vocab_size=self.vocab_size,
-                n_embd=self.hidden_size,
-                n_layer=self.num_hidden_layers,
-                n_head=self.num_attention_heads,
-                # intermediate_size=self.intermediate_size,
-                # hidden_act=self.hidden_act,
-                # hidden_dropout_prob=self.hidden_dropout_prob,
-                # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-                n_positions=self.max_position_embeddings,
-                n_ctx=self.max_position_embeddings
-                # type_vocab_size=self.type_vocab_size,
-                # initializer_range=self.initializer_range
-            )
-
-            head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
-
-            return (
-                config,
-                input_ids,
-                head_mask,
-                token_type_ids,
-                sequence_labels,
-                token_labels,
-                choice_labels,
-            )
-
-        def check_loss_output(self, result):
-            self.parent.assertListEqual(list(result["loss"].size()), [])
-
-        def create_and_check_openai_gpt_model(self, config, input_ids, head_mask, token_type_ids, *args):
-            model = OpenAIGPTModel(config=config)
-            model.to(torch_device)
-            model.eval()
-
-            model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
-            model(input_ids, token_type_ids=token_type_ids)
-            (sequence_output,) = model(input_ids)
-
-            result = {"sequence_output": sequence_output}
-            self.parent.assertListEqual(
-                list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size],
-            )
-
-        def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
-            model = OpenAIGPTLMHeadModel(config)
-            model.to(torch_device)
-            model.eval()
-
-            loss, lm_logits = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
-
-            result = {"loss": loss, "lm_logits": lm_logits}
-
-            self.parent.assertListEqual(list(result["loss"].size()), [])
-            self.parent.assertListEqual(
-                list(result["lm_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size],
-            )
-
-        def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
-            model = OpenAIGPTDoubleHeadsModel(config)
-            model.to(torch_device)
-            model.eval()
-
-            loss, lm_logits, mc_logits = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
-
-            result = {"loss": loss, "lm_logits": lm_logits}
-
-            self.parent.assertListEqual(list(result["loss"].size()), [])
-            self.parent.assertListEqual(
-                list(result["lm_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size],
-            )
-
-        def prepare_config_and_inputs_for_common(self):
-            config_and_inputs = self.prepare_config_and_inputs()
-            (
-                config,
-                input_ids,
-                head_mask,
-                token_type_ids,
-                sequence_labels,
-                token_labels,
-                choice_labels,
-            ) = config_and_inputs
-            inputs_dict = {
-                "input_ids": input_ids,
-                "token_type_ids": token_type_ids,
-                "head_mask": head_mask,
-            }
-
-            return config, inputs_dict
-
     def setUp(self):
-        self.model_tester = OpenAIGPTModelTest.OpenAIGPTModelTester(self)
+        self.model_tester = OpenAIGPTModelTester(self)
         self.config_tester = ConfigTester(self, config_class=OpenAIGPTConfig, n_embd=37)
 
     def test_config(self):
@@ -39,6 +39,183 @@ if is_torch_available():
     from transformers.modeling_utils import create_position_ids_from_input_ids
 
 
+class RobertaModelTester:
+    def __init__(
+        self, parent,
+    ):
+        self.parent = parent
+        self.batch_size = 13
+        self.seq_length = 7
+        self.is_training = True
+        self.use_input_mask = True
+        self.use_token_type_ids = True
+        self.use_labels = True
+        self.vocab_size = 99
+        self.hidden_size = 32
+        self.num_hidden_layers = 5
+        self.num_attention_heads = 4
+        self.intermediate_size = 37
+        self.hidden_act = "gelu"
+        self.hidden_dropout_prob = 0.1
+        self.attention_probs_dropout_prob = 0.1
+        self.max_position_embeddings = 512
+        self.type_vocab_size = 16
+        self.type_sequence_label_size = 2
+        self.initializer_range = 0.02
+        self.num_labels = 3
+        self.num_choices = 4
+        self.scope = None
+
+    def prepare_config_and_inputs(self):
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+        input_mask = None
+        if self.use_input_mask:
+            input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
+
+        token_type_ids = None
+        if self.use_token_type_ids:
+            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
+
+        sequence_labels = None
+        token_labels = None
+        choice_labels = None
+        if self.use_labels:
+            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
+            choice_labels = ids_tensor([self.batch_size], self.num_choices)
+
+        config = RobertaConfig(
+            vocab_size=self.vocab_size,
+            hidden_size=self.hidden_size,
+            num_hidden_layers=self.num_hidden_layers,
+            num_attention_heads=self.num_attention_heads,
+            intermediate_size=self.intermediate_size,
+            hidden_act=self.hidden_act,
+            hidden_dropout_prob=self.hidden_dropout_prob,
+            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
+            max_position_embeddings=self.max_position_embeddings,
+            type_vocab_size=self.type_vocab_size,
+            initializer_range=self.initializer_range,
+        )
+
+        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+
+    def check_loss_output(self, result):
+        self.parent.assertListEqual(list(result["loss"].size()), [])
+
+    def create_and_check_roberta_model(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        model = RobertaModel(config=config)
+        model.to(torch_device)
+        model.eval()
+        sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
+        sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
+        sequence_output, pooled_output = model(input_ids)
+
+        result = {
+            "sequence_output": sequence_output,
+            "pooled_output": pooled_output,
+        }
+        self.parent.assertListEqual(
+            list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
+        )
+        self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
+
+    def create_and_check_roberta_for_masked_lm(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        model = RobertaForMaskedLM(config=config)
+        model.to(torch_device)
+        model.eval()
+        loss, prediction_scores = model(
+            input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels
+        )
+        result = {
+            "loss": loss,
+            "prediction_scores": prediction_scores,
+        }
+        self.parent.assertListEqual(
+            list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
+        )
+        self.check_loss_output(result)
+
+    def create_and_check_roberta_for_token_classification(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        config.num_labels = self.num_labels
+        model = RobertaForTokenClassification(config=config)
+        model.to(torch_device)
+        model.eval()
+        loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
+        result = {
+            "loss": loss,
+            "logits": logits,
+        }
+        self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels])
+        self.check_loss_output(result)
+
+    def create_and_check_roberta_for_multiple_choice(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        config.num_choices = self.num_choices
+        model = RobertaForMultipleChoice(config=config)
+        model.to(torch_device)
+        model.eval()
+        multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+        multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+        multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+        loss, logits = model(
+            multiple_choice_inputs_ids,
+            attention_mask=multiple_choice_input_mask,
+            token_type_ids=multiple_choice_token_type_ids,
+            labels=choice_labels,
+        )
+        result = {
+            "loss": loss,
+            "logits": logits,
+        }
+        self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_choices])
+        self.check_loss_output(result)
+
+    def create_and_check_roberta_for_question_answering(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        model = RobertaForQuestionAnswering(config=config)
+        model.to(torch_device)
+        model.eval()
+        loss, start_logits, end_logits = model(
+            input_ids,
+            attention_mask=input_mask,
+            token_type_ids=token_type_ids,
+            start_positions=sequence_labels,
+            end_positions=sequence_labels,
+        )
+        result = {
+            "loss": loss,
+            "start_logits": start_logits,
+            "end_logits": end_logits,
+        }
+        self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
+        self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
+        self.check_loss_output(result)
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        (
+            config,
+            input_ids,
+            token_type_ids,
+            input_mask,
+            sequence_labels,
+            token_labels,
+            choice_labels,
+        ) = config_and_inputs
+        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
+        return config, inputs_dict
+
+
 @require_torch
 class RobertaModelTest(ModelTesterMixin, unittest.TestCase):
 
@@ -55,210 +232,8 @@ class RobertaModelTest(ModelTesterMixin, unittest.TestCase):
         else ()
     )
 
-    class RobertaModelTester(object):
-        def __init__(
-            self,
-            parent,
-            batch_size=13,
-            seq_length=7,
-            is_training=True,
-            use_input_mask=True,
-            use_token_type_ids=True,
-            use_labels=True,
-            vocab_size=99,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            intermediate_size=37,
-            hidden_act="gelu",
-            hidden_dropout_prob=0.1,
-            attention_probs_dropout_prob=0.1,
-            max_position_embeddings=512,
-            type_vocab_size=16,
-            type_sequence_label_size=2,
-            initializer_range=0.02,
-            num_labels=3,
-            num_choices=4,
-            scope=None,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.is_training = is_training
-            self.use_input_mask = use_input_mask
-            self.use_token_type_ids = use_token_type_ids
-            self.use_labels = use_labels
-            self.vocab_size = vocab_size
-            self.hidden_size = hidden_size
-            self.num_hidden_layers = num_hidden_layers
-            self.num_attention_heads = num_attention_heads
-            self.intermediate_size = intermediate_size
-            self.hidden_act = hidden_act
-            self.hidden_dropout_prob = hidden_dropout_prob
-            self.attention_probs_dropout_prob = attention_probs_dropout_prob
-            self.max_position_embeddings = max_position_embeddings
-            self.type_vocab_size = type_vocab_size
-            self.type_sequence_label_size = type_sequence_label_size
-            self.initializer_range = initializer_range
-            self.num_labels = num_labels
-            self.num_choices = num_choices
-            self.scope = scope
-
-        def prepare_config_and_inputs(self):
-            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-            input_mask = None
-            if self.use_input_mask:
-                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-            token_type_ids = None
-            if self.use_token_type_ids:
-                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-            sequence_labels = None
-            token_labels = None
-            choice_labels = None
-            if self.use_labels:
-                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-                choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-            config = RobertaConfig(
-                vocab_size=self.vocab_size,
-                hidden_size=self.hidden_size,
-                num_hidden_layers=self.num_hidden_layers,
-                num_attention_heads=self.num_attention_heads,
-                intermediate_size=self.intermediate_size,
-                hidden_act=self.hidden_act,
-                hidden_dropout_prob=self.hidden_dropout_prob,
-                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-                max_position_embeddings=self.max_position_embeddings,
-                type_vocab_size=self.type_vocab_size,
-                initializer_range=self.initializer_range,
-            )
-
-            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-        def check_loss_output(self, result):
-            self.parent.assertListEqual(list(result["loss"].size()), [])
-
-        def create_and_check_roberta_model(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = RobertaModel(config=config)
-            model.to(torch_device)
-            model.eval()
-            sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-            sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
-            sequence_output, pooled_output = model(input_ids)
-
-            result = {
-                "sequence_output": sequence_output,
-                "pooled_output": pooled_output,
-            }
-            self.parent.assertListEqual(
-                list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
-            )
-            self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
-
-        def create_and_check_roberta_for_masked_lm(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = RobertaForMaskedLM(config=config)
-            model.to(torch_device)
-            model.eval()
-            loss, prediction_scores = model(
-                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels
-            )
-            result = {
-                "loss": loss,
-                "prediction_scores": prediction_scores,
-            }
-            self.parent.assertListEqual(
-                list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
-            )
-            self.check_loss_output(result)
-
-        def create_and_check_roberta_for_token_classification(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            config.num_labels = self.num_labels
-            model = RobertaForTokenClassification(config=config)
-            model.to(torch_device)
-            model.eval()
-            loss, logits = model(
-                input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels
-            )
-            result = {
-                "loss": loss,
-                "logits": logits,
-            }
-            self.parent.assertListEqual(
-                list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels]
-            )
-            self.check_loss_output(result)
-
-        def create_and_check_roberta_for_multiple_choice(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            config.num_choices = self.num_choices
-            model = RobertaForMultipleChoice(config=config)
-            model.to(torch_device)
-            model.eval()
-            multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-            multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-            multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-            loss, logits = model(
-                multiple_choice_inputs_ids,
-                attention_mask=multiple_choice_input_mask,
-                token_type_ids=multiple_choice_token_type_ids,
-                labels=choice_labels,
-            )
-            result = {
-                "loss": loss,
-                "logits": logits,
-            }
-            self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_choices])
-            self.check_loss_output(result)
-
-        def create_and_check_roberta_for_question_answering(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = RobertaForQuestionAnswering(config=config)
-            model.to(torch_device)
-            model.eval()
-            loss, start_logits, end_logits = model(
-                input_ids,
-                attention_mask=input_mask,
-                token_type_ids=token_type_ids,
-                start_positions=sequence_labels,
-                end_positions=sequence_labels,
-            )
-            result = {
-                "loss": loss,
-                "start_logits": start_logits,
-                "end_logits": end_logits,
-            }
-            self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
-            self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
-            self.check_loss_output(result)
-
-        def prepare_config_and_inputs_for_common(self):
-            config_and_inputs = self.prepare_config_and_inputs()
-            (
-                config,
-                input_ids,
-                token_type_ids,
-                input_mask,
-                sequence_labels,
-                token_labels,
-                choice_labels,
-            ) = config_and_inputs
-            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
-            return config, inputs_dict
-
     def setUp(self):
-        self.model_tester = RobertaModelTest.RobertaModelTester(self)
+        self.model_tester = RobertaModelTester(self)
         self.config_tester = ConfigTester(self, config_class=RobertaConfig, hidden_size=37)
 
     def test_config(self):
@@ -30,6 +30,268 @@ if is_torch_available():
     from transformers.tokenization_t5 import T5Tokenizer
 
 
+class T5ModelTester:
+    def __init__(self, parent):
+        self.parent = parent
+        self.batch_size = 13
+        self.encoder_seq_length = 7
+        self.decoder_seq_length = 9
+        self.is_training = True
+        self.use_attention_mask = True
+        self.use_labels = True
+        self.vocab_size = 99
+        self.n_positions = 14
+        self.hidden_size = 32
+        self.num_hidden_layers = 5
+        self.num_attention_heads = 4
+        self.d_ff = 37
+        self.relative_attention_num_buckets = 8
+        self.dropout_rate = 0.1
+        self.initializer_factor = 0.002
+        self.eos_token_id = 1
+        self.pad_token_id = 0
+        self.decoder_start_token_id = 0
+        self.scope = None
+
+    def prepare_config_and_inputs(self):
+        input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size)
+        decoder_input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
+
+        attention_mask = None
+        decoder_attention_mask = None
+        if self.use_attention_mask:
+            attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2)
+            decoder_attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2)
+
+        lm_labels = None
+        if self.use_labels:
+            lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
+
+        config = T5Config(
+            vocab_size=self.vocab_size,
+            n_positions=self.n_positions,
+            d_model=self.hidden_size,
+            d_ff=self.d_ff,
+            d_kv=self.hidden_size // self.num_attention_heads,
+            num_layers=self.num_hidden_layers,
+            num_heads=self.num_attention_heads,
+            relative_attention_num_buckets=self.relative_attention_num_buckets,
+            dropout_rate=self.dropout_rate,
+            initializer_factor=self.initializer_factor,
+            eos_token_id=self.eos_token_id,
+            bos_token_id=self.pad_token_id,
+            pad_token_id=self.pad_token_id,
+            decoder_start_token_id=self.decoder_start_token_id,
+        )
+
+        return (
+            config,
+            input_ids,
+            decoder_input_ids,
+            attention_mask,
+            decoder_attention_mask,
+            lm_labels,
+        )
+
+    def check_loss_output(self, result):
+        self.parent.assertListEqual(list(result["loss"].size()), [])
+
+    def check_prepare_lm_labels_via_shift_left(
+        self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
+    ):
+        model = T5Model(config=config)
+        model.to(torch_device)
+        model.eval()
+
+        # make sure that lm_labels are correctly padded from the right
+        lm_labels.masked_fill_((lm_labels == self.decoder_start_token_id), self.eos_token_id)
+
+        # add casaul pad token mask
+        triangular_mask = torch.tril(lm_labels.new_ones(lm_labels.shape)).logical_not()
+        lm_labels.masked_fill_(triangular_mask, self.pad_token_id)
+        decoder_input_ids = model._shift_right(lm_labels)
+
+        for i, (decoder_input_ids_slice, lm_labels_slice) in enumerate(zip(decoder_input_ids, lm_labels)):
+            # first item
+            self.parent.assertEqual(decoder_input_ids_slice[0].item(), self.decoder_start_token_id)
+            if i < decoder_input_ids_slice.shape[-1]:
+                if i < decoder_input_ids.shape[-1] - 1:
+                    # items before diagonal
+                    self.parent.assertListEqual(
+                        decoder_input_ids_slice[1 : i + 1].tolist(), lm_labels_slice[:i].tolist()
+                    )
+                # pad items after diagonal
+                if i < decoder_input_ids.shape[-1] - 2:
+                    self.parent.assertListEqual(
+                        decoder_input_ids_slice[i + 2 :].tolist(), lm_labels_slice[i + 1 : -1].tolist()
+                    )
+            else:
+                # all items after square
+                self.parent.assertListEqual(decoder_input_ids_slice[1:].tolist(), lm_labels_slice[:-1].tolist())
+
+    def create_and_check_t5_model(
+        self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
+    ):
+        model = T5Model(config=config)
+        model.to(torch_device)
+        model.eval()
+        decoder_output, decoder_past, encoder_output = model(
+            input_ids=input_ids,
+            decoder_input_ids=decoder_input_ids,
+            attention_mask=attention_mask,
+            decoder_attention_mask=decoder_attention_mask,
+        )
+        decoder_output, decoder_past, encoder_output = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
+
+        result = {
+            "encoder_output": encoder_output,
+            "decoder_output": decoder_output,
+            "decoder_past": decoder_past,
+        }
+        self.parent.assertListEqual(
+            list(result["encoder_output"].size()), [self.batch_size, self.encoder_seq_length, self.hidden_size]
+        )
+        self.parent.assertListEqual(
+            list(result["decoder_output"].size()), [self.batch_size, self.decoder_seq_length, self.hidden_size]
+        )
+        self.parent.assertEqual(len(decoder_past), 2)
+        # decoder_past[0] should correspond to encoder output
+        self.parent.assertTrue(torch.all(decoder_past[0][0] == encoder_output))
+        # There should be `num_layers` key value embeddings stored in decoder_past[1]
+        self.parent.assertEqual(len(decoder_past[1]), config.num_layers)
+        # There should be a self attn key, a self attn value, a cross attn key and a cross attn value stored in each decoder_past[1] tuple
+        self.parent.assertEqual(len(decoder_past[1][0]), 4)
+
+    def create_and_check_t5_with_lm_head(
+        self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
+    ):
+        model = T5ForConditionalGeneration(config=config)
+        model.to(torch_device)
+        model.eval()
+        outputs = model(
+            input_ids=input_ids,
+            decoder_input_ids=decoder_input_ids,
+            decoder_attention_mask=decoder_attention_mask,
+            labels=lm_labels,
+        )
+        loss, prediction_scores, _, _ = outputs
+        self.parent.assertEqual(len(outputs), 4)
+        result = {
+            "loss": loss,
+            "prediction_scores": prediction_scores,
+        }
+        self.parent.assertListEqual(
+            list(result["prediction_scores"].size()), [self.batch_size, self.decoder_seq_length, self.vocab_size]
+        )
+        self.check_loss_output(result)
+
+    def create_and_check_t5_decoder_model_past(
+        self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
+    ):
+        model = T5Model(config=config).get_decoder()
+        model.to(torch_device)
+        model.eval()
+
+        # first forward pass
+        output, past_key_value_states = model(input_ids, use_cache=True)
+
+        # create hypothetical next token and extent to next_input_ids
+        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
+
+        # append to next input_ids and
+        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
+
+        output_from_no_past = model(next_input_ids)[0]
+        output_from_past = model(next_tokens, past_key_value_states=past_key_value_states)[0]
+
+        # select random slice
+        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
+        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
+        output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
+
+        # test that outputs are equal for slice
+        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
+
+    def create_and_check_t5_decoder_model_attention_mask_past(
+        self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
+    ):
+        model = T5Model(config=config).get_decoder()
+        model.to(torch_device)
+        model.eval()
+
+        # create attention mask
+        attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
+
+        half_seq_length = input_ids.shape[-1] // 2
+        attn_mask[:, half_seq_length:] = 0
+
+        # first forward pass
+        output, past_key_value_states = model(input_ids, attention_mask=attn_mask, use_cache=True)
+
+        # create hypothetical next token and extent to next_input_ids
+        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
+
+        # change a random masked slice from input_ids
+        random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1
+        random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1)
+        input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens
+
+        # append to next input_ids and attn_mask
+        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
+        attn_mask = torch.cat(
+            [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)], dim=1,
+        )
+
+        # get two different outputs
+        output_from_no_past = model(next_input_ids, attention_mask=attn_mask)[0]
+        output_from_past = model(next_tokens, past_key_value_states=past_key_value_states, attention_mask=attn_mask)[0]
+
+        # select random slice
+        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
+        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
+        output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
+
+        # test that outputs are equal for slice
+        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
+
+    def create_t5_and_check_t5_generate_with_past_key_value_states(
+        self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
+    ):
+        model = T5ForConditionalGeneration(config=config)
+        model.to(torch_device)
+        model.eval()
+        torch.manual_seed(0)
+        output_without_past_cache = model.generate(
+            input_ids[:1], num_beams=2, max_length=5, do_sample=True, use_cache=False
+        )
+        torch.manual_seed(0)
+        output_with_past_cache = model.generate(input_ids[:1], num_beams=2, max_length=5, do_sample=True)
+        self.parent.assertTrue(torch.all(output_with_past_cache == output_without_past_cache))
+
+    def create_and_check_t5_model_fp16_forward(
+        self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
+    ):
+        model = T5Model(config=config)
+        model.to(torch_device)
+        model.half()
+        model.eval()
+        output = model(input_ids, decoder_input_ids=input_ids, attention_mask=attention_mask)[0]
+        self.parent.assertFalse(torch.isnan(output).any().item())
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        (config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,) = config_and_inputs
+
+        inputs_dict = {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "decoder_input_ids": decoder_input_ids,
+            "decoder_attention_mask": decoder_attention_mask,
+            "use_cache": False,
+        }
+        return config, inputs_dict
+
+
 @require_torch
 class T5ModelTest(ModelTesterMixin, unittest.TestCase):
 
@@ -40,302 +302,8 @@ class T5ModelTest(ModelTesterMixin, unittest.TestCase):
     test_resize_embeddings = False
     is_encoder_decoder = True
 
-    class T5ModelTester(object):
-        def __init__(
-            self,
-            parent,
-            batch_size=13,
-            encoder_seq_length=7,
-            decoder_seq_length=9,
-            is_training=True,
-            use_attention_mask=True,
-            use_labels=True,
-            vocab_size=99,
-            n_positions=14,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            d_ff=37,
-            relative_attention_num_buckets=8,
-            dropout_rate=0.1,
-            initializer_factor=0.002,
-            eos_token_id=1,
-            pad_token_id=0,
-            decoder_start_token_id=0,
-            scope=None,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.encoder_seq_length = encoder_seq_length
-            self.decoder_seq_length = decoder_seq_length
-            self.is_training = is_training
-            self.use_attention_mask = use_attention_mask
-            self.use_labels = use_labels
-            self.vocab_size = vocab_size
-            self.n_positions = n_positions
-            self.hidden_size = hidden_size
-            self.num_hidden_layers = num_hidden_layers
-            self.num_attention_heads = num_attention_heads
-            self.d_ff = d_ff
-            self.relative_attention_num_buckets = relative_attention_num_buckets
-            self.dropout_rate = dropout_rate
-            self.initializer_factor = initializer_factor
-            self.scope = scope
-            self.eos_token_id = eos_token_id
-            self.pad_token_id = pad_token_id
-            self.decoder_start_token_id = decoder_start_token_id
-
-        def prepare_config_and_inputs(self):
-            input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size)
-            decoder_input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
-
-            attention_mask = None
-            decoder_attention_mask = None
-            if self.use_attention_mask:
-                attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2)
-                decoder_attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2)
-
-            lm_labels = None
-            if self.use_labels:
-                lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
-
-            config = T5Config(
-                vocab_size=self.vocab_size,
-                n_positions=self.n_positions,
-                d_model=self.hidden_size,
-                d_ff=self.d_ff,
-                d_kv=self.hidden_size // self.num_attention_heads,
-                num_layers=self.num_hidden_layers,
-                num_heads=self.num_attention_heads,
-                relative_attention_num_buckets=self.relative_attention_num_buckets,
-                dropout_rate=self.dropout_rate,
-                initializer_factor=self.initializer_factor,
-                eos_token_id=self.eos_token_id,
-                bos_token_id=self.pad_token_id,
-                pad_token_id=self.pad_token_id,
-                decoder_start_token_id=self.decoder_start_token_id,
-            )
-
-            return (
-                config,
-                input_ids,
-                decoder_input_ids,
-                attention_mask,
-                decoder_attention_mask,
-                lm_labels,
-            )
-
-        def check_loss_output(self, result):
-            self.parent.assertListEqual(list(result["loss"].size()), [])
-
-        def check_prepare_lm_labels_via_shift_left(
-            self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
-        ):
-            model = T5Model(config=config)
-            model.to(torch_device)
-            model.eval()
-
-            # make sure that lm_labels are correctly padded from the right
-            lm_labels.masked_fill_((lm_labels == self.decoder_start_token_id), self.eos_token_id)
-
-            # add casaul pad token mask
-            triangular_mask = torch.tril(lm_labels.new_ones(lm_labels.shape)).logical_not()
-            lm_labels.masked_fill_(triangular_mask, self.pad_token_id)
-            decoder_input_ids = model._shift_right(lm_labels)
-
-            for i, (decoder_input_ids_slice, lm_labels_slice) in enumerate(zip(decoder_input_ids, lm_labels)):
-                # first item
-                self.parent.assertEqual(decoder_input_ids_slice[0].item(), self.decoder_start_token_id)
-                if i < decoder_input_ids_slice.shape[-1]:
-                    if i < decoder_input_ids.shape[-1] - 1:
-                        # items before diagonal
-                        self.parent.assertListEqual(
-                            decoder_input_ids_slice[1 : i + 1].tolist(), lm_labels_slice[:i].tolist()
-                        )
-                    # pad items after diagonal
-                    if i < decoder_input_ids.shape[-1] - 2:
-                        self.parent.assertListEqual(
-                            decoder_input_ids_slice[i + 2 :].tolist(), lm_labels_slice[i + 1 : -1].tolist()
-                        )
-                else:
-                    # all items after square
-                    self.parent.assertListEqual(decoder_input_ids_slice[1:].tolist(), lm_labels_slice[:-1].tolist())
-
-        def create_and_check_t5_model(
-            self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
-        ):
-            model = T5Model(config=config)
-            model.to(torch_device)
-            model.eval()
-            decoder_output, decoder_past, encoder_output = model(
-                input_ids=input_ids,
-                decoder_input_ids=decoder_input_ids,
-                attention_mask=attention_mask,
-                decoder_attention_mask=decoder_attention_mask,
-            )
-            decoder_output, decoder_past, encoder_output = model(
-                input_ids=input_ids, decoder_input_ids=decoder_input_ids
-            )
-
-            result = {
-                "encoder_output": encoder_output,
-                "decoder_output": decoder_output,
-                "decoder_past": decoder_past,
-            }
-            self.parent.assertListEqual(
-                list(result["encoder_output"].size()), [self.batch_size, self.encoder_seq_length, self.hidden_size]
-            )
-            self.parent.assertListEqual(
-                list(result["decoder_output"].size()), [self.batch_size, self.decoder_seq_length, self.hidden_size]
-            )
-            self.parent.assertEqual(len(decoder_past), 2)
-            # decoder_past[0] should correspond to encoder output
-            self.parent.assertTrue(torch.all(decoder_past[0][0] == encoder_output))
-            # There should be `num_layers` key value embeddings stored in decoder_past[1]
-            self.parent.assertEqual(len(decoder_past[1]), config.num_layers)
-            # There should be a self attn key, a self attn value, a cross attn key and a cross attn value stored in each decoder_past[1] tuple
-            self.parent.assertEqual(len(decoder_past[1][0]), 4)
-
-        def create_and_check_t5_with_lm_head(
-            self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
-        ):
-            model = T5ForConditionalGeneration(config=config)
-            model.to(torch_device)
-            model.eval()
-            outputs = model(
-                input_ids=input_ids,
-                decoder_input_ids=decoder_input_ids,
-                decoder_attention_mask=decoder_attention_mask,
-                labels=lm_labels,
-            )
-            loss, prediction_scores, _, _ = outputs
-            self.parent.assertEqual(len(outputs), 4)
-            result = {
-                "loss": loss,
-                "prediction_scores": prediction_scores,
-            }
-            self.parent.assertListEqual(
-                list(result["prediction_scores"].size()), [self.batch_size, self.decoder_seq_length, self.vocab_size]
-            )
-            self.check_loss_output(result)
-
-        def create_and_check_t5_decoder_model_past(
-            self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
-        ):
-            model = T5Model(config=config).get_decoder()
-            model.to(torch_device)
-            model.eval()
-
-            # first forward pass
-            output, past_key_value_states = model(input_ids, use_cache=True)
-
-            # create hypothetical next token and extent to next_input_ids
-            next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
-
-            # append to next input_ids and
-            next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
-
-            output_from_no_past = model(next_input_ids)[0]
-            output_from_past = model(next_tokens, past_key_value_states=past_key_value_states)[0]
-
-            # select random slice
-            random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
-            output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
-            output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
-
-            # test that outputs are equal for slice
-            self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
-
-        def create_and_check_t5_decoder_model_attention_mask_past(
-            self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
-        ):
-            model = T5Model(config=config).get_decoder()
-            model.to(torch_device)
-            model.eval()
-
-            # create attention mask
-            attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
-
-            half_seq_length = input_ids.shape[-1] // 2
-            attn_mask[:, half_seq_length:] = 0
-
-            # first forward pass
-            output, past_key_value_states = model(input_ids, attention_mask=attn_mask, use_cache=True)
-
-            # create hypothetical next token and extent to next_input_ids
-            next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
-
-            # change a random masked slice from input_ids
-            random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1
-            random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1)
-            input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens
-
-            # append to next input_ids and attn_mask
-            next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
-            attn_mask = torch.cat(
-                [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)], dim=1,
-            )
-
-            # get two different outputs
-            output_from_no_past = model(next_input_ids, attention_mask=attn_mask)[0]
-            output_from_past = model(
-                next_tokens, past_key_value_states=past_key_value_states, attention_mask=attn_mask
-            )[0]
-
-            # select random slice
-            random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
-            output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
-            output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
-
-            # test that outputs are equal for slice
-            self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
-
-        def create_t5_and_check_t5_generate_with_past_key_value_states(
-            self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
-        ):
-            model = T5ForConditionalGeneration(config=config)
-            model.to(torch_device)
-            model.eval()
-            torch.manual_seed(0)
-            output_without_past_cache = model.generate(
-                input_ids[:1], num_beams=2, max_length=5, do_sample=True, use_cache=False
-            )
-            torch.manual_seed(0)
-            output_with_past_cache = model.generate(input_ids[:1], num_beams=2, max_length=5, do_sample=True)
-            self.parent.assertTrue(torch.all(output_with_past_cache == output_without_past_cache))
-
-        def create_and_check_t5_model_fp16_forward(
-            self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
-        ):
-            model = T5Model(config=config)
-            model.to(torch_device)
-            model.half()
-            model.eval()
-            output = model(input_ids, decoder_input_ids=input_ids, attention_mask=attention_mask)[0]
-            self.parent.assertFalse(torch.isnan(output).any().item())
-
-        def prepare_config_and_inputs_for_common(self):
-            config_and_inputs = self.prepare_config_and_inputs()
-            (
-                config,
-                input_ids,
-                decoder_input_ids,
-                attention_mask,
-                decoder_attention_mask,
-                lm_labels,
-            ) = config_and_inputs
-
-            inputs_dict = {
-                "input_ids": input_ids,
-                "attention_mask": attention_mask,
-                "decoder_input_ids": decoder_input_ids,
-                "decoder_attention_mask": decoder_attention_mask,
-                "use_cache": False,
-            }
-            return config, inputs_dict
-
     def setUp(self):
-        self.model_tester = T5ModelTest.T5ModelTester(self)
+        self.model_tester = T5ModelTester(self)
        self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37)
 
     def test_config(self):
@@ -34,6 +34,186 @@ if is_tf_available():
     )
 
 
+class TFAlbertModelTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=13,
+        seq_length=7,
+        is_training=True,
+        use_input_mask=True,
+        use_token_type_ids=True,
+        use_labels=True,
+        vocab_size=99,
+        embedding_size=16,
+        hidden_size=32,
+        num_hidden_layers=5,
+        num_attention_heads=4,
+        intermediate_size=37,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=16,
+        type_sequence_label_size=2,
+        initializer_range=0.02,
+        num_labels=3,
+        num_choices=4,
+        scope=None,
+    ):
+        self.parent = parent
+        self.batch_size = 13
+        self.seq_length = 7
+        self.is_training = True
+        self.use_input_mask = True
+        self.use_token_type_ids = True
+        self.use_labels = True
+        self.vocab_size = 99
+        self.embedding_size = 16
+        self.hidden_size = 32
+        self.num_hidden_layers = 5
+        self.num_attention_heads = 4
+        self.intermediate_size = 37
+        self.hidden_act = "gelu"
+        self.hidden_dropout_prob = 0.1
+        self.attention_probs_dropout_prob = 0.1
+        self.max_position_embeddings = 512
+        self.type_vocab_size = 16
+        self.type_sequence_label_size = 2
+        self.initializer_range = 0.02
+        self.num_labels = 3
+        self.num_choices = 4
+        self.scope = None
+
+    def prepare_config_and_inputs(self):
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+        input_mask = None
+        if self.use_input_mask:
+            input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
+
+        token_type_ids = None
+        if self.use_token_type_ids:
+            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
+
+        sequence_labels = None
+        token_labels = None
+        choice_labels = None
+        if self.use_labels:
+            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
+            choice_labels = ids_tensor([self.batch_size], self.num_choices)
+
+        config = AlbertConfig(
+            vocab_size=self.vocab_size,
+            hidden_size=self.hidden_size,
+            num_hidden_layers=self.num_hidden_layers,
+            num_attention_heads=self.num_attention_heads,
+            intermediate_size=self.intermediate_size,
+            hidden_act=self.hidden_act,
+            hidden_dropout_prob=self.hidden_dropout_prob,
+            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
+            max_position_embeddings=self.max_position_embeddings,
+            type_vocab_size=self.type_vocab_size,
+            initializer_range=self.initializer_range,
+        )
+
+        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+
+    def create_and_check_albert_model(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        model = TFAlbertModel(config=config)
+        # inputs = {'input_ids': input_ids,
+        #           'attention_mask': input_mask,
+        #           'token_type_ids': token_type_ids}
+        # sequence_output, pooled_output = model(**inputs)
+        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
+        sequence_output, pooled_output = model(inputs)
+
+        inputs = [input_ids, input_mask]
+        sequence_output, pooled_output = model(inputs)
+
+        sequence_output, pooled_output = model(input_ids)
+
+        result = {
+            "sequence_output": sequence_output.numpy(),
+            "pooled_output": pooled_output.numpy(),
+        }
+        self.parent.assertListEqual(
+            list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
+        )
+        self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size])
+
+    def create_and_check_albert_for_pretraining(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        config.num_labels = self.num_labels
+        model = TFAlbertForPreTraining(config=config)
+        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
+        prediction_scores, sop_scores = model(inputs)
+        result = {
+            "prediction_scores": prediction_scores.numpy(),
+            "sop_scores": sop_scores.numpy(),
+        }
+        self.parent.assertListEqual(
+            list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
+        )
+        self.parent.assertListEqual(list(result["sop_scores"].shape), [self.batch_size, self.num_labels])
+
+    def create_and_check_albert_for_masked_lm(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        model = TFAlbertForMaskedLM(config=config)
+        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
+        (prediction_scores,) = model(inputs)
+        result = {
+            "prediction_scores": prediction_scores.numpy(),
+        }
+        self.parent.assertListEqual(
+            list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
+        )
+
+    def create_and_check_albert_for_sequence_classification(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        config.num_labels = self.num_labels
+        model = TFAlbertForSequenceClassification(config=config)
+        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
+        (logits,) = model(inputs)
+        result = {
+            "logits": logits.numpy(),
+        }
+        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels])
+
+    def create_and_check_albert_for_question_answering(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        model = TFAlbertForQuestionAnswering(config=config)
+        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
+        start_logits, end_logits = model(inputs)
+        result = {
+            "start_logits": start_logits.numpy(),
+            "end_logits": end_logits.numpy(),
+        }
+        self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
+        self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        (
+            config,
+            input_ids,
+            token_type_ids,
+            input_mask,
+            sequence_labels,
+            token_labels,
+            choice_labels,
+        ) = config_and_inputs
+        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
+        return config, inputs_dict
+
+
 @require_tf
 class TFAlbertModelTest(TFModelTesterMixin, unittest.TestCase):
 
@@ -49,187 +229,8 @@ class TFAlbertModelTest(TFModelTesterMixin, unittest.TestCase):
         else ()
     )
 
-    class TFAlbertModelTester(object):
-        def __init__(
-            self,
-            parent,
-            batch_size=13,
-            seq_length=7,
-            is_training=True,
-            use_input_mask=True,
-            use_token_type_ids=True,
-            use_labels=True,
-            vocab_size=99,
-            embedding_size=16,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            intermediate_size=37,
-            hidden_act="gelu",
-            hidden_dropout_prob=0.1,
-            attention_probs_dropout_prob=0.1,
-            max_position_embeddings=512,
-            type_vocab_size=16,
-            type_sequence_label_size=2,
-            initializer_range=0.02,
-            num_labels=3,
-            num_choices=4,
-            scope=None,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.is_training = is_training
-            self.use_input_mask = use_input_mask
-            self.use_token_type_ids = use_token_type_ids
-            self.use_labels = use_labels
-            self.vocab_size = vocab_size
-            self.embedding_size = embedding_size
-            self.hidden_size = hidden_size
-            self.num_hidden_layers = num_hidden_layers
-            self.num_attention_heads = num_attention_heads
-            self.intermediate_size = intermediate_size
-            self.hidden_act = hidden_act
-            self.hidden_dropout_prob = hidden_dropout_prob
-            self.attention_probs_dropout_prob = attention_probs_dropout_prob
-            self.max_position_embeddings = max_position_embeddings
-            self.type_vocab_size = type_vocab_size
-            self.type_sequence_label_size = type_sequence_label_size
-            self.initializer_range = initializer_range
-            self.num_labels = num_labels
-            self.num_choices = num_choices
-            self.scope = scope
-
-        def prepare_config_and_inputs(self):
-            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-            input_mask = None
-            if self.use_input_mask:
-                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-            token_type_ids = None
-            if self.use_token_type_ids:
-                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-            sequence_labels = None
-            token_labels = None
-            choice_labels = None
-            if self.use_labels:
-                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-                choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-            config = AlbertConfig(
-                vocab_size=self.vocab_size,
-                hidden_size=self.hidden_size,
-                num_hidden_layers=self.num_hidden_layers,
-                num_attention_heads=self.num_attention_heads,
-                intermediate_size=self.intermediate_size,
-                hidden_act=self.hidden_act,
-                hidden_dropout_prob=self.hidden_dropout_prob,
-                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-                max_position_embeddings=self.max_position_embeddings,
-                type_vocab_size=self.type_vocab_size,
-                initializer_range=self.initializer_range,
-            )
-
-            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-
-        def create_and_check_albert_model(
-            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
-        ):
-            model = TFAlbertModel(config=config)
-            # inputs = {'input_ids': input_ids,
-            #           'attention_mask': input_mask,
-            #           'token_type_ids': token_type_ids}
-            # sequence_output, pooled_output = model(**inputs)
-            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-            sequence_output, pooled_output = model(inputs)
-
-            inputs = [input_ids, input_mask]
-            sequence_output, pooled_output = model(inputs)
-
-            sequence_output, pooled_output = model(input_ids)
-
-            result = {
-                "sequence_output": sequence_output.numpy(),
-                "pooled_output": pooled_output.numpy(),
-            }
-            self.parent.assertListEqual(
-                list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
-            )
|
|
||||||
self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size])
|
|
||||||
|
|
||||||
def create_and_check_albert_for_pretraining(
|
|
||||||
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
|
|
||||||
):
|
|
||||||
config.num_labels = self.num_labels
|
|
||||||
model = TFAlbertForPreTraining(config=config)
|
|
||||||
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
|
|
||||||
prediction_scores, sop_scores = model(inputs)
|
|
||||||
result = {
|
|
||||||
"prediction_scores": prediction_scores.numpy(),
|
|
||||||
"sop_scores": sop_scores.numpy(),
|
|
||||||
}
|
|
||||||
self.parent.assertListEqual(
|
|
||||||
list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
|
|
||||||
)
|
|
||||||
self.parent.assertListEqual(list(result["sop_scores"].shape), [self.batch_size, self.num_labels])
|
|
||||||
|
|
||||||
def create_and_check_albert_for_masked_lm(
|
|
||||||
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
|
|
||||||
):
|
|
||||||
model = TFAlbertForMaskedLM(config=config)
|
|
||||||
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
|
|
||||||
(prediction_scores,) = model(inputs)
|
|
||||||
result = {
|
|
||||||
"prediction_scores": prediction_scores.numpy(),
|
|
||||||
}
|
|
||||||
self.parent.assertListEqual(
|
|
||||||
list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
|
|
||||||
)
|
|
||||||
|
|
||||||
def create_and_check_albert_for_sequence_classification(
|
|
||||||
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
|
|
||||||
):
|
|
||||||
config.num_labels = self.num_labels
|
|
||||||
model = TFAlbertForSequenceClassification(config=config)
|
|
||||||
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
|
|
||||||
(logits,) = model(inputs)
|
|
||||||
result = {
|
|
||||||
"logits": logits.numpy(),
|
|
||||||
}
|
|
||||||
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels])
|
|
||||||
|
|
||||||
def create_and_check_albert_for_question_answering(
|
|
||||||
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
|
|
||||||
):
|
|
||||||
model = TFAlbertForQuestionAnswering(config=config)
|
|
||||||
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
|
|
||||||
start_logits, end_logits = model(inputs)
|
|
||||||
result = {
|
|
||||||
"start_logits": start_logits.numpy(),
|
|
||||||
"end_logits": end_logits.numpy(),
|
|
||||||
}
|
|
||||||
self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
|
|
||||||
self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])
|
|
||||||
|
|
||||||
def prepare_config_and_inputs_for_common(self):
|
|
||||||
config_and_inputs = self.prepare_config_and_inputs()
|
|
||||||
(
|
|
||||||
config,
|
|
||||||
input_ids,
|
|
||||||
token_type_ids,
|
|
||||||
input_mask,
|
|
||||||
sequence_labels,
|
|
||||||
token_labels,
|
|
||||||
choice_labels,
|
|
||||||
) = config_and_inputs
|
|
||||||
inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
|
|
||||||
return config, inputs_dict
|
|
||||||
|
|
||||||
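Note that `create_and_check_albert_model` exercises the three input formats the Keras-style models accept: a dict of named tensors, a positional list, and a bare tensor. Condensed, using the same names as the code above:

    model = TFAlbertModel(config=config)
    out = model({"input_ids": input_ids, "attention_mask": input_mask})  # dict of named tensors
    out = model([input_ids, input_mask])                                 # list, positional order
    out = model(input_ids)                                               # single tensor
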
    def setUp(self):
-        self.model_tester = TFAlbertModelTest.TFAlbertModelTester(self)
+        self.model_tester = TFAlbertModelTester(self)
        self.config_tester = ConfigTester(self, config_class=AlbertConfig, hidden_size=37)

    def test_config(self):

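With the tester at module level, `setUp` no longer reaches through the test class. A sketch of how the `test_*` methods in this file then drive it (mirroring the pattern already used in this test suite):

    def test_albert_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_albert_model(*config_and_inputs)
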
@@ -37,6 +37,221 @@ if is_tf_available():
    )

class TFBertModelTester:
    def __init__(
        self,
        parent,
        batch_size=13,
        seq_length=7,
        is_training=True,
        use_input_mask=True,
        use_token_type_ids=True,
        use_labels=True,
        vocab_size=99,
        hidden_size=32,
        num_hidden_layers=5,
        num_attention_heads=4,
        intermediate_size=37,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=16,
        type_sequence_label_size=2,
        initializer_range=0.02,
        num_labels=3,
        num_choices=4,
        scope=None,
    ):
        self.parent = parent
        self.batch_size = 13
        self.seq_length = 7
        self.is_training = True
        self.use_input_mask = True
        self.use_token_type_ids = True
        self.use_labels = True
        self.vocab_size = 99
        self.hidden_size = 32
        self.num_hidden_layers = 5
        self.num_attention_heads = 4
        self.intermediate_size = 37
        self.hidden_act = "gelu"
        self.hidden_dropout_prob = 0.1
        self.attention_probs_dropout_prob = 0.1
        self.max_position_embeddings = 512
        self.type_vocab_size = 16
        self.type_sequence_label_size = 2
        self.initializer_range = 0.02
        self.num_labels = 3
        self.num_choices = 4
        self.scope = None

    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        input_mask = None
        if self.use_input_mask:
            input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

        token_type_ids = None
        if self.use_token_type_ids:
            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

        sequence_labels = None
        token_labels = None
        choice_labels = None
        if self.use_labels:
            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)

        config = BertConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size,
            hidden_act=self.hidden_act,
            hidden_dropout_prob=self.hidden_dropout_prob,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            max_position_embeddings=self.max_position_embeddings,
            type_vocab_size=self.type_vocab_size,
            initializer_range=self.initializer_range,
        )

        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels

    def create_and_check_bert_model(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        model = TFBertModel(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        sequence_output, pooled_output = model(inputs)

        inputs = [input_ids, input_mask]
        sequence_output, pooled_output = model(inputs)

        sequence_output, pooled_output = model(input_ids)

        result = {
            "sequence_output": sequence_output.numpy(),
            "pooled_output": pooled_output.numpy(),
        }
        self.parent.assertListEqual(
            list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
        )
        self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size])

    def create_and_check_bert_for_masked_lm(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        model = TFBertForMaskedLM(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        (prediction_scores,) = model(inputs)
        result = {
            "prediction_scores": prediction_scores.numpy(),
        }
        self.parent.assertListEqual(
            list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
        )

    def create_and_check_bert_for_next_sequence_prediction(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        model = TFBertForNextSentencePrediction(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        (seq_relationship_score,) = model(inputs)
        result = {
            "seq_relationship_score": seq_relationship_score.numpy(),
        }
        self.parent.assertListEqual(list(result["seq_relationship_score"].shape), [self.batch_size, 2])

    def create_and_check_bert_for_pretraining(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        model = TFBertForPreTraining(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        prediction_scores, seq_relationship_score = model(inputs)
        result = {
            "prediction_scores": prediction_scores.numpy(),
            "seq_relationship_score": seq_relationship_score.numpy(),
        }
        self.parent.assertListEqual(
            list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
        )
        self.parent.assertListEqual(list(result["seq_relationship_score"].shape), [self.batch_size, 2])

    def create_and_check_bert_for_sequence_classification(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        config.num_labels = self.num_labels
        model = TFBertForSequenceClassification(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        (logits,) = model(inputs)
        result = {
            "logits": logits.numpy(),
        }
        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels])

    def create_and_check_bert_for_multiple_choice(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        config.num_choices = self.num_choices
        model = TFBertForMultipleChoice(config=config)
        multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
        multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
        multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
        inputs = {
            "input_ids": multiple_choice_inputs_ids,
            "attention_mask": multiple_choice_input_mask,
            "token_type_ids": multiple_choice_token_type_ids,
        }
        (logits,) = model(inputs)
        result = {
            "logits": logits.numpy(),
        }
        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices])

    def create_and_check_bert_for_token_classification(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        config.num_labels = self.num_labels
        model = TFBertForTokenClassification(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        (logits,) = model(inputs)
        result = {
            "logits": logits.numpy(),
        }
        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels])

    def create_and_check_bert_for_question_answering(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        model = TFBertForQuestionAnswering(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        start_logits, end_logits = model(inputs)
        result = {
            "start_logits": start_logits.numpy(),
            "end_logits": end_logits.numpy(),
        }
        self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
        self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])

    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        (
            config,
            input_ids,
            token_type_ids,
            input_mask,
            sequence_labels,
            token_labels,
            choice_labels,
        ) = config_and_inputs
        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
        return config, inputs_dict

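One quirk of the hoisted `TFBertModelTester`: the keyword parameters survive in the signature, but the body assigns hard-coded values, so any override a caller passes is silently ignored. A quick illustration (not part of the commit):

    tester = TFBertModelTester(parent=None, batch_size=64)
    assert tester.batch_size == 13  # the batch_size=64 argument has no effect
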
@require_tf
class TFBertModelTest(TFModelTesterMixin, unittest.TestCase):

@@ -55,224 +270,8 @@ class TFBertModelTest(TFModelTesterMixin, unittest.TestCase):
        else ()
    )

    class TFBertModelTester(object):
        def __init__(
            self,
            parent,
            batch_size=13,
            seq_length=7,
            is_training=True,
            use_input_mask=True,
            use_token_type_ids=True,
            use_labels=True,
            vocab_size=99,
            hidden_size=32,
            num_hidden_layers=5,
            num_attention_heads=4,
            intermediate_size=37,
            hidden_act="gelu",
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            max_position_embeddings=512,
            type_vocab_size=16,
            type_sequence_label_size=2,
            initializer_range=0.02,
            num_labels=3,
            num_choices=4,
            scope=None,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_input_mask = use_input_mask
            self.use_token_type_ids = use_token_type_ids
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.intermediate_size = intermediate_size
            self.hidden_act = hidden_act
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.type_sequence_label_size = type_sequence_label_size
            self.initializer_range = initializer_range
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope

        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            input_mask = None
            if self.use_input_mask:
                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

            token_type_ids = None
            if self.use_token_type_ids:
                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

            sequence_labels = None
            token_labels = None
            choice_labels = None
            if self.use_labels:
                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = BertConfig(
                vocab_size=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,
                intermediate_size=self.intermediate_size,
                hidden_act=self.hidden_act,
                hidden_dropout_prob=self.hidden_dropout_prob,
                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                max_position_embeddings=self.max_position_embeddings,
                type_vocab_size=self.type_vocab_size,
                initializer_range=self.initializer_range,
            )

            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels

        def create_and_check_bert_model(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = TFBertModel(config=config)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
            sequence_output, pooled_output = model(inputs)

            inputs = [input_ids, input_mask]
            sequence_output, pooled_output = model(inputs)

            sequence_output, pooled_output = model(input_ids)

            result = {
                "sequence_output": sequence_output.numpy(),
                "pooled_output": pooled_output.numpy(),
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
            )
            self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size])

        def create_and_check_bert_for_masked_lm(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = TFBertForMaskedLM(config=config)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
            (prediction_scores,) = model(inputs)
            result = {
                "prediction_scores": prediction_scores.numpy(),
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
            )

        def create_and_check_bert_for_next_sequence_prediction(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = TFBertForNextSentencePrediction(config=config)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
            (seq_relationship_score,) = model(inputs)
            result = {
                "seq_relationship_score": seq_relationship_score.numpy(),
            }
            self.parent.assertListEqual(list(result["seq_relationship_score"].shape), [self.batch_size, 2])

        def create_and_check_bert_for_pretraining(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = TFBertForPreTraining(config=config)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
            prediction_scores, seq_relationship_score = model(inputs)
            result = {
                "prediction_scores": prediction_scores.numpy(),
                "seq_relationship_score": seq_relationship_score.numpy(),
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
            )
            self.parent.assertListEqual(list(result["seq_relationship_score"].shape), [self.batch_size, 2])

        def create_and_check_bert_for_sequence_classification(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            config.num_labels = self.num_labels
            model = TFBertForSequenceClassification(config=config)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
            (logits,) = model(inputs)
            result = {
                "logits": logits.numpy(),
            }
            self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels])

        def create_and_check_bert_for_multiple_choice(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            config.num_choices = self.num_choices
            model = TFBertForMultipleChoice(config=config)
            multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
            multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
            multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
            inputs = {
                "input_ids": multiple_choice_inputs_ids,
                "attention_mask": multiple_choice_input_mask,
                "token_type_ids": multiple_choice_token_type_ids,
            }
            (logits,) = model(inputs)
            result = {
                "logits": logits.numpy(),
            }
            self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices])

        def create_and_check_bert_for_token_classification(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            config.num_labels = self.num_labels
            model = TFBertForTokenClassification(config=config)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
            (logits,) = model(inputs)
            result = {
                "logits": logits.numpy(),
            }
            self.parent.assertListEqual(
                list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels]
            )

        def create_and_check_bert_for_question_answering(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = TFBertForQuestionAnswering(config=config)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
            start_logits, end_logits = model(inputs)
            result = {
                "start_logits": start_logits.numpy(),
                "end_logits": end_logits.numpy(),
            }
            self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
            self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (
                config,
                input_ids,
                token_type_ids,
                input_mask,
                sequence_labels,
                token_labels,
                choice_labels,
            ) = config_and_inputs
            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
            return config, inputs_dict

    def setUp(self):
-        self.model_tester = TFBertModelTest.TFBertModelTester(self)
+        self.model_tester = TFBertModelTester(self)
        self.config_tester = ConfigTester(self, config_class=BertConfig, hidden_size=37)

    def test_config(self):

@@ -28,163 +28,141 @@ if is_tf_available():
    from transformers.modeling_tf_ctrl import TFCTRLModel, TFCTRLLMHeadModel, TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST

class TFCTRLModelTester(object):
    def __init__(
        self, parent,
    ):
        self.parent = parent
        self.batch_size = 13
        self.seq_length = 7
        self.is_training = True
        self.use_token_type_ids = True
        self.use_input_mask = True
        self.use_labels = True
        self.use_mc_token_ids = True
        self.vocab_size = 99
        self.hidden_size = 32
        self.num_hidden_layers = 5
        self.num_attention_heads = 4
        self.intermediate_size = 37
        self.hidden_act = "gelu"
        self.hidden_dropout_prob = 0.1
        self.attention_probs_dropout_prob = 0.1
        self.max_position_embeddings = 512
        self.type_vocab_size = 16
        self.type_sequence_label_size = 2
        self.initializer_range = 0.02
        self.num_labels = 3
        self.num_choices = 4
        self.scope = None

    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        input_mask = None
        if self.use_input_mask:
            input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

        token_type_ids = None
        if self.use_token_type_ids:
            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

        mc_token_ids = None
        if self.use_mc_token_ids:
            mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length)

        sequence_labels = None
        token_labels = None
        choice_labels = None
        if self.use_labels:
            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)

        config = CTRLConfig(
            vocab_size=self.vocab_size,
            n_embd=self.hidden_size,
            n_layer=self.num_hidden_layers,
            n_head=self.num_attention_heads,
            # intermediate_size=self.intermediate_size,
            # hidden_act=self.hidden_act,
            # hidden_dropout_prob=self.hidden_dropout_prob,
            # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            n_positions=self.max_position_embeddings,
            n_ctx=self.max_position_embeddings
            # type_vocab_size=self.type_vocab_size,
            # initializer_range=self.initializer_range
        )

        head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)

        return (
            config,
            input_ids,
            input_mask,
            head_mask,
            token_type_ids,
            mc_token_ids,
            sequence_labels,
            token_labels,
            choice_labels,
        )

    def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
        model = TFCTRLModel(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        sequence_output = model(inputs)[0]

        inputs = [input_ids, None, input_mask]  # None is the input for 'past'
        sequence_output = model(inputs)[0]

        sequence_output = model(input_ids)[0]

        result = {
            "sequence_output": sequence_output.numpy(),
        }
        self.parent.assertListEqual(
            list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
        )

    def create_and_check_ctrl_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
        model = TFCTRLLMHeadModel(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        prediction_scores = model(inputs)[0]
        result = {
            "prediction_scores": prediction_scores.numpy(),
        }
        self.parent.assertListEqual(
            list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
        )

    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()

        (
            config,
            input_ids,
            input_mask,
            head_mask,
            token_type_ids,
            mc_token_ids,
            sequence_labels,
            token_labels,
            choice_labels,
        ) = config_and_inputs

        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
        return config, inputs_dict

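Unlike the BERT-style configs above, `CTRLConfig` takes GPT-style parameter names, which is why the tester maps its generic attributes onto `n_embd`, `n_layer`, `n_head` and `n_positions`, and comments out the keys CTRL does not define. A compact restatement of the mapping, using the values hard-coded in this tester:

    config = CTRLConfig(
        vocab_size=99,    # vocab_size keeps its name
        n_embd=32,        # hidden_size -> n_embd
        n_layer=5,        # num_hidden_layers -> n_layer
        n_head=4,         # num_attention_heads -> n_head
        n_positions=512,  # max_position_embeddings -> n_positions
        n_ctx=512,
    )
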
@require_tf
class TFCTRLModelTest(TFModelTesterMixin, unittest.TestCase):

    all_model_classes = (TFCTRLModel, TFCTRLLMHeadModel) if is_tf_available() else ()
    all_generative_model_classes = (TFCTRLLMHeadModel,) if is_tf_available() else ()

    class TFCTRLModelTester(object):
        def __init__(
            self,
            parent,
            batch_size=13,
            seq_length=7,
            is_training=True,
            use_token_type_ids=True,
            use_input_mask=True,
            use_labels=True,
            use_mc_token_ids=True,
            vocab_size=99,
            hidden_size=32,
            num_hidden_layers=5,
            num_attention_heads=4,
            intermediate_size=37,
            hidden_act="gelu",
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            max_position_embeddings=512,
            type_vocab_size=16,
            type_sequence_label_size=2,
            initializer_range=0.02,
            num_labels=3,
            num_choices=4,
            scope=None,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_token_type_ids = use_token_type_ids
            self.use_input_mask = use_input_mask
            self.use_labels = use_labels
            self.use_mc_token_ids = use_mc_token_ids
            self.vocab_size = vocab_size
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.intermediate_size = intermediate_size
            self.hidden_act = hidden_act
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.type_sequence_label_size = type_sequence_label_size
            self.initializer_range = initializer_range
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope

        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            input_mask = None
            if self.use_input_mask:
                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

            token_type_ids = None
            if self.use_token_type_ids:
                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

            mc_token_ids = None
            if self.use_mc_token_ids:
                mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length)

            sequence_labels = None
            token_labels = None
            choice_labels = None
            if self.use_labels:
                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = CTRLConfig(
                vocab_size=self.vocab_size,
                n_embd=self.hidden_size,
                n_layer=self.num_hidden_layers,
                n_head=self.num_attention_heads,
                # intermediate_size=self.intermediate_size,
                # hidden_act=self.hidden_act,
                # hidden_dropout_prob=self.hidden_dropout_prob,
                # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                n_positions=self.max_position_embeddings,
                n_ctx=self.max_position_embeddings
                # type_vocab_size=self.type_vocab_size,
                # initializer_range=self.initializer_range
            )

            head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)

            return (
                config,
                input_ids,
                input_mask,
                head_mask,
                token_type_ids,
                mc_token_ids,
                sequence_labels,
                token_labels,
                choice_labels,
            )

        def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
            model = TFCTRLModel(config=config)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
            sequence_output = model(inputs)[0]

            inputs = [input_ids, None, input_mask]  # None is the input for 'past'
            sequence_output = model(inputs)[0]

            sequence_output = model(input_ids)[0]

            result = {
                "sequence_output": sequence_output.numpy(),
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
            )

        def create_and_check_ctrl_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
            model = TFCTRLLMHeadModel(config=config)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
            prediction_scores = model(inputs)[0]
            result = {
                "prediction_scores": prediction_scores.numpy(),
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
            )

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()

            (
                config,
                input_ids,
                input_mask,
                head_mask,
                token_type_ids,
                mc_token_ids,
                sequence_labels,
                token_labels,
                choice_labels,
            ) = config_and_inputs

            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
            return config, inputs_dict

    def setUp(self):
-        self.model_tester = TFCTRLModelTest.TFCTRLModelTester(self)
+        self.model_tester = TFCTRLModelTester(self)
        self.config_tester = ConfigTester(self, config_class=CTRLConfig, n_embd=37)

    def test_config(self):

@@ -32,6 +32,128 @@ if is_tf_available():
    )

class TFDistilBertModelTester:
    def __init__(
        self, parent,
    ):
        self.parent = parent
        self.batch_size = 13
        self.seq_length = 7
        self.is_training = True
        self.use_input_mask = True
        self.use_token_type_ids = False
        self.use_labels = True
        self.vocab_size = 99
        self.hidden_size = 32
        self.num_hidden_layers = 5
        self.num_attention_heads = 4
        self.intermediate_size = 37
        self.hidden_act = "gelu"
        self.hidden_dropout_prob = 0.1
        self.attention_probs_dropout_prob = 0.1
        self.max_position_embeddings = 512
        self.type_vocab_size = 16
        self.type_sequence_label_size = 2
        self.initializer_range = 0.02
        self.num_labels = 3
        self.num_choices = 4
        self.scope = None

    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        input_mask = None
        if self.use_input_mask:
            input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

        sequence_labels = None
        token_labels = None
        choice_labels = None
        if self.use_labels:
            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)

        config = DistilBertConfig(
            vocab_size=self.vocab_size,
            dim=self.hidden_size,
            n_layers=self.num_hidden_layers,
            n_heads=self.num_attention_heads,
            hidden_dim=self.intermediate_size,
            hidden_act=self.hidden_act,
            dropout=self.hidden_dropout_prob,
            attention_dropout=self.attention_probs_dropout_prob,
            max_position_embeddings=self.max_position_embeddings,
            initializer_range=self.initializer_range,
        )

        return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels

    def create_and_check_distilbert_model(
        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        model = TFDistilBertModel(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask}

        outputs = model(inputs)
        sequence_output = outputs[0]

        inputs = [input_ids, input_mask]

        (sequence_output,) = model(inputs)

        result = {
            "sequence_output": sequence_output.numpy(),
        }
        self.parent.assertListEqual(
            list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
        )

    def create_and_check_distilbert_for_masked_lm(
        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        model = TFDistilBertForMaskedLM(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask}
        (prediction_scores,) = model(inputs)
        result = {
            "prediction_scores": prediction_scores.numpy(),
        }
        self.parent.assertListEqual(
            list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
        )

    def create_and_check_distilbert_for_question_answering(
        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        model = TFDistilBertForQuestionAnswering(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask}
        start_logits, end_logits = model(inputs)
        result = {
            "start_logits": start_logits.numpy(),
            "end_logits": end_logits.numpy(),
        }
        self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
        self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])

    def create_and_check_distilbert_for_sequence_classification(
        self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        config.num_labels = self.num_labels
        model = TFDistilBertForSequenceClassification(config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask}
        (logits,) = model(inputs)
        result = {
            "logits": logits.numpy(),
        }
        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels])

    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        (config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs
        inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
        return config, inputs_dict

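DistilBERT has no segment embeddings, so this tester sets `use_token_type_ids = False`, returns a shorter `prepare_config_and_inputs` tuple than the other testers, and builds every inputs dict without a `token_type_ids` key:

    inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}  # no token_type_ids
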
@require_tf
class TFDistilBertModelTest(TFModelTesterMixin, unittest.TestCase):

@@ -50,151 +172,8 @@ class TFDistilBertModelTest(TFModelTesterMixin, unittest.TestCase):
    test_resize_embeddings = True
    test_head_masking = True

    class TFDistilBertModelTester(object):
        def __init__(
            self,
            parent,
            batch_size=13,
            seq_length=7,
            is_training=True,
            use_input_mask=True,
            use_token_type_ids=False,
            use_labels=True,
            vocab_size=99,
            hidden_size=32,
            num_hidden_layers=5,
            num_attention_heads=4,
            intermediate_size=37,
            hidden_act="gelu",
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            max_position_embeddings=512,
            type_vocab_size=16,
            type_sequence_label_size=2,
            initializer_range=0.02,
            num_labels=3,
            num_choices=4,
            scope=None,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_input_mask = use_input_mask
            self.use_token_type_ids = use_token_type_ids
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.intermediate_size = intermediate_size
            self.hidden_act = hidden_act
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.type_sequence_label_size = type_sequence_label_size
            self.initializer_range = initializer_range
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope

        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            input_mask = None
            if self.use_input_mask:
                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

            sequence_labels = None
            token_labels = None
            choice_labels = None
            if self.use_labels:
                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = DistilBertConfig(
                vocab_size=self.vocab_size,
                dim=self.hidden_size,
                n_layers=self.num_hidden_layers,
                n_heads=self.num_attention_heads,
                hidden_dim=self.intermediate_size,
                hidden_act=self.hidden_act,
                dropout=self.hidden_dropout_prob,
                attention_dropout=self.attention_probs_dropout_prob,
                max_position_embeddings=self.max_position_embeddings,
                initializer_range=self.initializer_range,
            )

            return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels

        def create_and_check_distilbert_model(
            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = TFDistilBertModel(config=config)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask}

            outputs = model(inputs)
            sequence_output = outputs[0]

            inputs = [input_ids, input_mask]

            (sequence_output,) = model(inputs)

            result = {
                "sequence_output": sequence_output.numpy(),
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
            )

        def create_and_check_distilbert_for_masked_lm(
            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = TFDistilBertForMaskedLM(config=config)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask}
            (prediction_scores,) = model(inputs)
            result = {
                "prediction_scores": prediction_scores.numpy(),
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
            )

        def create_and_check_distilbert_for_question_answering(
            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = TFDistilBertForQuestionAnswering(config=config)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask}
            start_logits, end_logits = model(inputs)
            result = {
                "start_logits": start_logits.numpy(),
                "end_logits": end_logits.numpy(),
            }
            self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
            self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])

        def create_and_check_distilbert_for_sequence_classification(
            self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            config.num_labels = self.num_labels
            model = TFDistilBertForSequenceClassification(config)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask}
            (logits,) = model(inputs)
            result = {
                "logits": logits.numpy(),
            }
            self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels])

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs
            inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
            return config, inputs_dict

    def setUp(self):
-        self.model_tester = TFDistilBertModelTest.TFDistilBertModelTester(self)
+        self.model_tester = TFDistilBertModelTester(self)
        self.config_tester = ConfigTester(self, config_class=DistilBertConfig, dim=37)

    def test_config(self):

@@ -32,6 +32,138 @@ if is_tf_available():
    )

class TFElectraModelTester:
|
||||||
|
def __init__(
|
||||||
|
self, parent,
|
||||||
|
):
|
||||||
|
self.parent = parent
|
||||||
|
self.batch_size = 13
|
||||||
|
self.seq_length = 7
|
||||||
|
self.is_training = True
|
||||||
|
self.use_input_mask = True
|
||||||
|
self.use_token_type_ids = True
|
||||||
|
self.use_labels = True
|
||||||
|
self.vocab_size = 99
|
||||||
|
self.hidden_size = 32
|
||||||
|
self.num_hidden_layers = 5
|
||||||
|
self.num_attention_heads = 4
|
||||||
|
self.intermediate_size = 37
|
||||||
|
self.hidden_act = "gelu"
|
||||||
|
self.hidden_dropout_prob = 0.1
|
||||||
|
self.attention_probs_dropout_prob = 0.1
|
||||||
|
self.max_position_embeddings = 512
|
||||||
|
self.type_vocab_size = 16
|
||||||
|
self.type_sequence_label_size = 2
|
||||||
|
self.initializer_range = 0.02
|
||||||
|
self.num_labels = 3
|
||||||
|
self.num_choices = 4
|
||||||
|
self.scope = None
|
||||||
|
|
||||||
|
def prepare_config_and_inputs(self):
|
||||||
|
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
|
||||||
|
|
||||||
|
input_mask = None
|
||||||
|
if self.use_input_mask:
|
||||||
|
input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
|
||||||
|
|
||||||
|
token_type_ids = None
|
||||||
|
if self.use_token_type_ids:
|
||||||
|
token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
|
||||||
|
|
||||||
|
sequence_labels = None
|
||||||
|
token_labels = None
|
||||||
|
choice_labels = None
|
||||||
|
if self.use_labels:
|
||||||
|
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
|
||||||
|
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
|
||||||
|
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||||
|
|
||||||
|
config = ElectraConfig(
|
||||||
|
vocab_size=self.vocab_size,
|
||||||
|
hidden_size=self.hidden_size,
|
||||||
|
num_hidden_layers=self.num_hidden_layers,
|
||||||
|
num_attention_heads=self.num_attention_heads,
|
||||||
|
intermediate_size=self.intermediate_size,
|
||||||
|
hidden_act=self.hidden_act,
|
||||||
|
hidden_dropout_prob=self.hidden_dropout_prob,
|
||||||
|
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
|
||||||
|
max_position_embeddings=self.max_position_embeddings,
|
||||||
|
type_vocab_size=self.type_vocab_size,
|
||||||
|
initializer_range=self.initializer_range,
|
||||||
|
)
|
||||||
|
|
||||||
|
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
|
||||||
|
|
||||||
|
def create_and_check_electra_model(
|
||||||
|
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
|
||||||
|
):
|
||||||
|
model = TFElectraModel(config=config)
|
||||||
|
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
|
||||||
|
(sequence_output,) = model(inputs)
|
||||||
|
|
||||||
|
inputs = [input_ids, input_mask]
|
||||||
|
(sequence_output,) = model(inputs)
|
||||||
|
|
||||||
|
(sequence_output,) = model(input_ids)
|
||||||
|
|
||||||
|
result = {
|
||||||
|
"sequence_output": sequence_output.numpy(),
|
||||||
|
}
|
||||||
|
self.parent.assertListEqual(
|
||||||
|
list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
|
||||||
|
)
|
||||||
|
|
||||||
|
def create_and_check_electra_for_masked_lm(
|
||||||
|
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
|
||||||
|
):
|
||||||
|
model = TFElectraForMaskedLM(config=config)
|
||||||
|
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
|
||||||
|
(prediction_scores,) = model(inputs)
|
||||||
|
result = {
|
||||||
|
"prediction_scores": prediction_scores.numpy(),
|
||||||
|
}
|
||||||
|
self.parent.assertListEqual(
|
||||||
|
list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
|
||||||
|
)
|
||||||
|
|
||||||
|
def create_and_check_electra_for_pretraining(
|
||||||
|
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
|
||||||
|
):
|
||||||
|
model = TFElectraForPreTraining(config=config)
|
||||||
|
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
|
||||||
|
(prediction_scores,) = model(inputs)
|
||||||
|
result = {
|
||||||
|
"prediction_scores": prediction_scores.numpy(),
|
||||||
|
}
|
||||||
|
self.parent.assertListEqual(list(result["prediction_scores"].shape), [self.batch_size, self.seq_length])
|
||||||
|
|
||||||
|
def create_and_check_electra_for_token_classification(
|
||||||
|
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
|
||||||
|
):
|
||||||
|
config.num_labels = self.num_labels
|
||||||
|
model = TFElectraForTokenClassification(config=config)
|
||||||
|
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
|
||||||
|
(logits,) = model(inputs)
|
||||||
|
result = {
|
||||||
|
"logits": logits.numpy(),
|
||||||
|
}
|
||||||
|
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels])
|
||||||
|
|
||||||
|
def prepare_config_and_inputs_for_common(self):
|
||||||
|
config_and_inputs = self.prepare_config_and_inputs()
|
||||||
|
(
|
||||||
|
config,
|
||||||
|
input_ids,
|
||||||
|
token_type_ids,
|
||||||
|
input_mask,
|
||||||
|
sequence_labels,
|
||||||
|
token_labels,
|
||||||
|
choice_labels,
|
||||||
|
) = config_and_inputs
|
||||||
|
inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
|
||||||
|
return config, inputs_dict
|
||||||
|
|
||||||
|
|
@require_tf
class TFElectraModelTest(TFModelTesterMixin, unittest.TestCase):

@@ -41,163 +173,8 @@ class TFElectraModelTest(TFModelTesterMixin, unittest.TestCase):

        else ()
    )

    class TFElectraModelTester(object):
        def __init__(
            self,
            parent,
            batch_size=13,
            seq_length=7,
            is_training=True,
            use_input_mask=True,
            use_token_type_ids=True,
            use_labels=True,
            vocab_size=99,
            hidden_size=32,
            num_hidden_layers=5,
            num_attention_heads=4,
            intermediate_size=37,
            hidden_act="gelu",
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            max_position_embeddings=512,
            type_vocab_size=16,
            type_sequence_label_size=2,
            initializer_range=0.02,
            num_labels=3,
            num_choices=4,
            scope=None,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_input_mask = use_input_mask
            self.use_token_type_ids = use_token_type_ids
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.intermediate_size = intermediate_size
            self.hidden_act = hidden_act
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.type_sequence_label_size = type_sequence_label_size
            self.initializer_range = initializer_range
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope

        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            input_mask = None
            if self.use_input_mask:
                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

            token_type_ids = None
            if self.use_token_type_ids:
                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

            sequence_labels = None
            token_labels = None
            choice_labels = None
            if self.use_labels:
                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = ElectraConfig(
                vocab_size=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,
                intermediate_size=self.intermediate_size,
                hidden_act=self.hidden_act,
                hidden_dropout_prob=self.hidden_dropout_prob,
                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                max_position_embeddings=self.max_position_embeddings,
                type_vocab_size=self.type_vocab_size,
                initializer_range=self.initializer_range,
            )

            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels

        def create_and_check_electra_model(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = TFElectraModel(config=config)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
            (sequence_output,) = model(inputs)

            inputs = [input_ids, input_mask]
            (sequence_output,) = model(inputs)

            (sequence_output,) = model(input_ids)

            result = {
                "sequence_output": sequence_output.numpy(),
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
            )

        def create_and_check_electra_for_masked_lm(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = TFElectraForMaskedLM(config=config)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
            (prediction_scores,) = model(inputs)
            result = {
                "prediction_scores": prediction_scores.numpy(),
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
            )

        def create_and_check_electra_for_pretraining(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = TFElectraForPreTraining(config=config)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
            (prediction_scores,) = model(inputs)
            result = {
                "prediction_scores": prediction_scores.numpy(),
            }
            self.parent.assertListEqual(list(result["prediction_scores"].shape), [self.batch_size, self.seq_length])

        def create_and_check_electra_for_token_classification(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            config.num_labels = self.num_labels
            model = TFElectraForTokenClassification(config=config)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
            (logits,) = model(inputs)
            result = {
                "logits": logits.numpy(),
            }
            self.parent.assertListEqual(
                list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels]
            )

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (
                config,
                input_ids,
                token_type_ids,
                input_mask,
                sequence_labels,
                token_labels,
                choice_labels,
            ) = config_and_inputs
            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
            return config, inputs_dict

    def setUp(self):
-        self.model_tester = TFElectraModelTest.TFElectraModelTester(self)
+        self.model_tester = TFElectraModelTester(self)
        self.config_tester = ConfigTester(self, config_class=ElectraConfig, hidden_size=37)

    def test_config(self):

@@ -34,268 +34,246 @@ if is_tf_available():
    )


class TFGPT2ModelTester:
    def __init__(
        self, parent,
    ):
        self.parent = parent
        self.batch_size = 13
        self.seq_length = 7
        self.is_training = True
        self.use_token_type_ids = True
        self.use_input_mask = True
        self.use_labels = True
        self.use_mc_token_ids = True
        self.vocab_size = 99
        self.hidden_size = 32
        self.num_hidden_layers = 5
        self.num_attention_heads = 4
        self.intermediate_size = 37
        self.hidden_act = "gelu"
        self.hidden_dropout_prob = 0.1
        self.attention_probs_dropout_prob = 0.1
        self.max_position_embeddings = 512
        self.type_vocab_size = 16
        self.type_sequence_label_size = 2
        self.initializer_range = 0.02
        self.num_labels = 3
        self.num_choices = 4
        self.scope = None
        self.bos_token_id = self.vocab_size - 1
        self.eos_token_id = self.vocab_size - 1

    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        input_mask = None
        if self.use_input_mask:
            input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

        token_type_ids = None
        if self.use_token_type_ids:
            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

        mc_token_ids = None
        if self.use_mc_token_ids:
            mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length)

        sequence_labels = None
        token_labels = None
        choice_labels = None
        if self.use_labels:
            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)

        config = GPT2Config(
            vocab_size=self.vocab_size,
            n_embd=self.hidden_size,
            n_layer=self.num_hidden_layers,
            n_head=self.num_attention_heads,
            # intermediate_size=self.intermediate_size,
            # hidden_act=self.hidden_act,
            # hidden_dropout_prob=self.hidden_dropout_prob,
            # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            n_positions=self.max_position_embeddings,
            n_ctx=self.max_position_embeddings,
            # type_vocab_size=self.type_vocab_size,
            # initializer_range=self.initializer_range
            bos_token_id=self.bos_token_id,
            eos_token_id=self.eos_token_id,
        )

        head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)

        return (
            config,
            input_ids,
            input_mask,
            head_mask,
            token_type_ids,
            mc_token_ids,
            sequence_labels,
            token_labels,
            choice_labels,
        )

    def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
        model = TFGPT2Model(config=config)
        inputs = {
            "input_ids": input_ids,
            "attention_mask": input_mask,
            "token_type_ids": token_type_ids,
        }
        sequence_output = model(inputs)[0]

        inputs = [input_ids, None, input_mask]  # None is the input for 'past'
        sequence_output = model(inputs)[0]

        sequence_output = model(input_ids)[0]

        result = {
            "sequence_output": sequence_output.numpy(),
        }
        self.parent.assertListEqual(
            list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size],
        )

    def create_and_check_gpt2_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
        model = TFGPT2Model(config=config)

        # first forward pass
        output, past = model(input_ids, token_type_ids=token_type_ids)

        # create hypothetical next token and extent to next_input_ids
        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
        next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size)

        # append to next input_ids and token_type_ids
        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
        next_token_type_ids = tf.concat([token_type_ids, next_token_types], axis=-1)

        output_from_no_past, _ = model(next_input_ids, token_type_ids=next_token_type_ids)
        output_from_past, _ = model(next_tokens, token_type_ids=next_token_types, past=past)

        # select random slice
        random_slice_idx = int(ids_tensor((1,), shape_list(output_from_past)[-1]))
        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
        output_from_past_slice = output_from_past[:, 0, random_slice_idx]

        # test that outputs are equal for slice
        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-6)

    def create_and_check_gpt2_model_attention_mask_past(
        self, config, input_ids, input_mask, head_mask, token_type_ids, *args
    ):
        model = TFGPT2Model(config=config)

        # create attention mask
        half_seq_length = self.seq_length // 2
        attn_mask_begin = tf.ones((self.batch_size, half_seq_length), dtype=tf.int32)
        attn_mask_end = tf.zeros((self.batch_size, self.seq_length - half_seq_length), dtype=tf.int32)
        attn_mask = tf.concat([attn_mask_begin, attn_mask_end], axis=1)

        # first forward pass
        output, past = model(input_ids, attention_mask=attn_mask)

        # create hypothetical next token and extent to next_input_ids
        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)

        # change a random masked slice from input_ids
        random_seq_idx_to_change = ids_tensor((1,), half_seq_length).numpy() + 1
        random_other_next_tokens = ids_tensor((self.batch_size, self.seq_length), config.vocab_size)
        vector_condition = tf.range(self.seq_length) == (self.seq_length - random_seq_idx_to_change)
        condition = tf.transpose(
            tf.broadcast_to(tf.expand_dims(vector_condition, -1), (self.seq_length, self.batch_size))
        )
        input_ids = tf.where(condition, random_other_next_tokens, input_ids)

        # append to next input_ids and attn_mask
        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
        attn_mask = tf.concat([attn_mask, tf.ones((shape_list(attn_mask)[0], 1), dtype=tf.int32)], axis=1)

        # get two different outputs
        output_from_no_past, _ = model(next_input_ids, attention_mask=attn_mask)
        output_from_past, _ = model(next_tokens, past=past, attention_mask=attn_mask)

        # select random slice
        random_slice_idx = int(ids_tensor((1,), shape_list(output_from_past)[-1]))
        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
        output_from_past_slice = output_from_past[:, 0, random_slice_idx]

        # test that outputs are equal for slice
        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-12)

    def create_and_check_gpt2_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
        model = TFGPT2LMHeadModel(config=config)
        inputs = {
            "input_ids": input_ids,
            "attention_mask": input_mask,
            "token_type_ids": token_type_ids,
        }
        prediction_scores = model(inputs)[0]
        result = {
            "prediction_scores": prediction_scores.numpy(),
        }
        self.parent.assertListEqual(
            list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size],
        )

    def create_and_check_gpt2_double_head(
        self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args
    ):
        model = TFGPT2DoubleHeadsModel(config=config)

        multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
        multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
        multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))

        inputs = {
            "input_ids": multiple_choice_inputs_ids,
            "mc_token_ids": mc_token_ids,
            "attention_mask": multiple_choice_input_mask,
            "token_type_ids": multiple_choice_token_type_ids,
        }
        lm_logits, mc_logits = model(inputs)[:2]
        result = {"lm_logits": lm_logits.numpy(), "mc_logits": mc_logits.numpy()}
        self.parent.assertListEqual(
            list(result["lm_logits"].shape), [self.batch_size, self.num_choices, self.seq_length, self.vocab_size],
        )
        self.parent.assertListEqual(list(result["mc_logits"].shape), [self.batch_size, self.num_choices])

    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()

        (
            config,
            input_ids,
            input_mask,
            head_mask,
            token_type_ids,
            mc_token_ids,
            sequence_labels,
            token_labels,
            choice_labels,
        ) = config_and_inputs

        inputs_dict = {
            "input_ids": input_ids,
            "token_type_ids": token_type_ids,
            "attention_mask": input_mask,
        }
        return config, inputs_dict


@require_tf
class TFGPT2ModelTest(TFModelTesterMixin, unittest.TestCase):

    all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel, TFGPT2DoubleHeadsModel) if is_tf_available() else ()
    all_generative_model_classes = (TFGPT2LMHeadModel,) if is_tf_available() else ()

    class TFGPT2ModelTester(object):
        def __init__(
            self,
            parent,
            batch_size=13,
            seq_length=7,
            is_training=True,
            use_token_type_ids=True,
            use_input_mask=True,
            use_labels=True,
            use_mc_token_ids=True,
            vocab_size=99,
            hidden_size=32,
            num_hidden_layers=5,
            num_attention_heads=4,
            intermediate_size=37,
            hidden_act="gelu",
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            max_position_embeddings=512,
            type_vocab_size=16,
            type_sequence_label_size=2,
            initializer_range=0.02,
            num_labels=3,
            num_choices=4,
            scope=None,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_token_type_ids = use_token_type_ids
            self.use_input_mask = use_input_mask
            self.use_labels = use_labels
            self.use_mc_token_ids = use_mc_token_ids
            self.vocab_size = vocab_size
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.intermediate_size = intermediate_size
            self.hidden_act = hidden_act
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.type_sequence_label_size = type_sequence_label_size
            self.initializer_range = initializer_range
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope
            self.bos_token_id = vocab_size - 1
            self.eos_token_id = vocab_size - 1

        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            input_mask = None
            if self.use_input_mask:
                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

            token_type_ids = None
            if self.use_token_type_ids:
                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

            mc_token_ids = None
            if self.use_mc_token_ids:
                mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length)

            sequence_labels = None
            token_labels = None
            choice_labels = None
            if self.use_labels:
                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = GPT2Config(
                vocab_size=self.vocab_size,
                n_embd=self.hidden_size,
                n_layer=self.num_hidden_layers,
                n_head=self.num_attention_heads,
                # intermediate_size=self.intermediate_size,
                # hidden_act=self.hidden_act,
                # hidden_dropout_prob=self.hidden_dropout_prob,
                # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                n_positions=self.max_position_embeddings,
                n_ctx=self.max_position_embeddings,
                # type_vocab_size=self.type_vocab_size,
                # initializer_range=self.initializer_range
                bos_token_id=self.bos_token_id,
                eos_token_id=self.eos_token_id,
            )

            head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)

            return (
                config,
                input_ids,
                input_mask,
                head_mask,
                token_type_ids,
                mc_token_ids,
                sequence_labels,
                token_labels,
                choice_labels,
            )

        def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
            model = TFGPT2Model(config=config)
            inputs = {
                "input_ids": input_ids,
                "attention_mask": input_mask,
                "token_type_ids": token_type_ids,
            }
            sequence_output = model(inputs)[0]

            inputs = [input_ids, None, input_mask]  # None is the input for 'past'
            sequence_output = model(inputs)[0]

            sequence_output = model(input_ids)[0]

            result = {
                "sequence_output": sequence_output.numpy(),
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size],
            )

        def create_and_check_gpt2_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
            model = TFGPT2Model(config=config)

            # first forward pass
            output, past = model(input_ids, token_type_ids=token_type_ids)

            # create hypothetical next token and extent to next_input_ids
            next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
            next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size)

            # append to next input_ids and token_type_ids
            next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
            next_token_type_ids = tf.concat([token_type_ids, next_token_types], axis=-1)

            output_from_no_past, _ = model(next_input_ids, token_type_ids=next_token_type_ids)
            output_from_past, _ = model(next_tokens, token_type_ids=next_token_types, past=past)

            # select random slice
            random_slice_idx = int(ids_tensor((1,), shape_list(output_from_past)[-1]))
            output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
            output_from_past_slice = output_from_past[:, 0, random_slice_idx]

            # test that outputs are equal for slice
            tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-6)

        def create_and_check_gpt2_model_attention_mask_past(
            self, config, input_ids, input_mask, head_mask, token_type_ids, *args
        ):
            model = TFGPT2Model(config=config)

            # create attention mask
            half_seq_length = self.seq_length // 2
            attn_mask_begin = tf.ones((self.batch_size, half_seq_length), dtype=tf.int32)
            attn_mask_end = tf.zeros((self.batch_size, self.seq_length - half_seq_length), dtype=tf.int32)
            attn_mask = tf.concat([attn_mask_begin, attn_mask_end], axis=1)

            # first forward pass
            output, past = model(input_ids, attention_mask=attn_mask)

            # create hypothetical next token and extent to next_input_ids
            next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)

            # change a random masked slice from input_ids
            random_seq_idx_to_change = ids_tensor((1,), half_seq_length).numpy() + 1
            random_other_next_tokens = ids_tensor((self.batch_size, self.seq_length), config.vocab_size)
            vector_condition = tf.range(self.seq_length) == (self.seq_length - random_seq_idx_to_change)
            condition = tf.transpose(
                tf.broadcast_to(tf.expand_dims(vector_condition, -1), (self.seq_length, self.batch_size))
            )
            input_ids = tf.where(condition, random_other_next_tokens, input_ids)

            # append to next input_ids and attn_mask
            next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
            attn_mask = tf.concat([attn_mask, tf.ones((shape_list(attn_mask)[0], 1), dtype=tf.int32)], axis=1)

            # get two different outputs
            output_from_no_past, _ = model(next_input_ids, attention_mask=attn_mask)
            output_from_past, _ = model(next_tokens, past=past, attention_mask=attn_mask)

            # select random slice
            random_slice_idx = int(ids_tensor((1,), shape_list(output_from_past)[-1]))
            output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
            output_from_past_slice = output_from_past[:, 0, random_slice_idx]

            # test that outputs are equal for slice
            tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-12)

        def create_and_check_gpt2_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
            model = TFGPT2LMHeadModel(config=config)
            inputs = {
                "input_ids": input_ids,
                "attention_mask": input_mask,
                "token_type_ids": token_type_ids,
            }
            prediction_scores = model(inputs)[0]
            result = {
                "prediction_scores": prediction_scores.numpy(),
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size],
            )

        def create_and_check_gpt2_double_head(
            self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args
        ):
            model = TFGPT2DoubleHeadsModel(config=config)

            multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
            multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
            multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))

            inputs = {
                "input_ids": multiple_choice_inputs_ids,
                "mc_token_ids": mc_token_ids,
                "attention_mask": multiple_choice_input_mask,
                "token_type_ids": multiple_choice_token_type_ids,
            }
            lm_logits, mc_logits = model(inputs)[:2]
            result = {"lm_logits": lm_logits.numpy(), "mc_logits": mc_logits.numpy()}
            self.parent.assertListEqual(
                list(result["lm_logits"].shape), [self.batch_size, self.num_choices, self.seq_length, self.vocab_size],
            )
            self.parent.assertListEqual(list(result["mc_logits"].shape), [self.batch_size, self.num_choices])

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()

            (
                config,
                input_ids,
                input_mask,
                head_mask,
                token_type_ids,
                mc_token_ids,
                sequence_labels,
                token_labels,
                choice_labels,
            ) = config_and_inputs

            inputs_dict = {
                "input_ids": input_ids,
                "token_type_ids": token_type_ids,
                "attention_mask": input_mask,
            }
            return config, inputs_dict

    def setUp(self):
-        self.model_tester = TFGPT2ModelTest.TFGPT2ModelTester(self)
+        self.model_tester = TFGPT2ModelTester(self)
        self.config_tester = ConfigTester(self, config_class=GPT2Config, n_embd=37)

    def test_config(self):

@@ -33,6 +33,155 @@ if is_tf_available():
    )


class TFOpenAIGPTModelTester:
    def __init__(
        self, parent,
    ):
        self.parent = parent
        self.batch_size = 13
        self.seq_length = 7
        self.is_training = True
        self.use_token_type_ids = True
        self.use_input_mask = True
        self.use_labels = True
        self.use_mc_token_ids = True
        self.vocab_size = 99
        self.hidden_size = 32
        self.num_hidden_layers = 5
        self.num_attention_heads = 4
        self.intermediate_size = 37
        self.hidden_act = "gelu"
        self.hidden_dropout_prob = 0.1
        self.attention_probs_dropout_prob = 0.1
        self.max_position_embeddings = 512
        self.type_vocab_size = 16
        self.type_sequence_label_size = 2
        self.initializer_range = 0.02
        self.num_labels = 3
        self.num_choices = 4
        self.scope = None

    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        input_mask = None
        if self.use_input_mask:
            input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

        token_type_ids = None
        if self.use_token_type_ids:
            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

        mc_token_ids = None
        if self.use_mc_token_ids:
            mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length)

        sequence_labels = None
        token_labels = None
        choice_labels = None
        if self.use_labels:
            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)

        config = OpenAIGPTConfig(
            vocab_size=self.vocab_size,
            n_embd=self.hidden_size,
            n_layer=self.num_hidden_layers,
            n_head=self.num_attention_heads,
            # intermediate_size=self.intermediate_size,
            # hidden_act=self.hidden_act,
            # hidden_dropout_prob=self.hidden_dropout_prob,
            # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            n_positions=self.max_position_embeddings,
            n_ctx=self.max_position_embeddings
            # type_vocab_size=self.type_vocab_size,
            # initializer_range=self.initializer_range
        )

        head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)

        return (
            config,
            input_ids,
            input_mask,
            head_mask,
            token_type_ids,
            mc_token_ids,
            sequence_labels,
            token_labels,
            choice_labels,
        )

    def create_and_check_openai_gpt_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
        model = TFOpenAIGPTModel(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        sequence_output = model(inputs)[0]

        inputs = [input_ids, input_mask]
        sequence_output = model(inputs)[0]

        sequence_output = model(input_ids)[0]

        result = {
            "sequence_output": sequence_output.numpy(),
        }
        self.parent.assertListEqual(
            list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
        )

    def create_and_check_openai_gpt_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
        model = TFOpenAIGPTLMHeadModel(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        prediction_scores = model(inputs)[0]
        result = {
            "prediction_scores": prediction_scores.numpy(),
        }
        self.parent.assertListEqual(
            list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
        )

    def create_and_check_openai_gpt_double_head(
        self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args
    ):
        model = TFOpenAIGPTDoubleHeadsModel(config=config)

        multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
        multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
        multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))

        inputs = {
            "input_ids": multiple_choice_inputs_ids,
            "mc_token_ids": mc_token_ids,
            "attention_mask": multiple_choice_input_mask,
            "token_type_ids": multiple_choice_token_type_ids,
        }
        lm_logits, mc_logits = model(inputs)[:2]
        result = {"lm_logits": lm_logits.numpy(), "mc_logits": mc_logits.numpy()}
        self.parent.assertListEqual(
            list(result["lm_logits"].shape), [self.batch_size, self.num_choices, self.seq_length, self.vocab_size]
        )
        self.parent.assertListEqual(list(result["mc_logits"].shape), [self.batch_size, self.num_choices])

    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()

        (
            config,
            input_ids,
            input_mask,
            head_mask,
            token_type_ids,
            mc_token_ids,
            sequence_labels,
            token_labels,
            choice_labels,
        ) = config_and_inputs

        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
        return config, inputs_dict


@require_tf
class TFOpenAIGPTModelTest(TFModelTesterMixin, unittest.TestCase):

@@ -43,179 +192,8 @@ class TFOpenAIGPTModelTest(TFModelTesterMixin, unittest.TestCase):

        (TFOpenAIGPTLMHeadModel,) if is_tf_available() else ()
    )  # TODO (PVP): Add Double HeadsModel when generate() function is changed accordingly

    class TFOpenAIGPTModelTester(object):
        def __init__(
            self,
            parent,
            batch_size=13,
            seq_length=7,
            is_training=True,
            use_token_type_ids=True,
            use_input_mask=True,
            use_labels=True,
            use_mc_token_ids=True,
            vocab_size=99,
            hidden_size=32,
            num_hidden_layers=5,
            num_attention_heads=4,
            intermediate_size=37,
            hidden_act="gelu",
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            max_position_embeddings=512,
            type_vocab_size=16,
            type_sequence_label_size=2,
            initializer_range=0.02,
            num_labels=3,
            num_choices=4,
            scope=None,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_token_type_ids = use_token_type_ids
            self.use_input_mask = use_input_mask
            self.use_labels = use_labels
            self.use_mc_token_ids = use_mc_token_ids
            self.vocab_size = vocab_size
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.intermediate_size = intermediate_size
            self.hidden_act = hidden_act
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.type_sequence_label_size = type_sequence_label_size
            self.initializer_range = initializer_range
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope

        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            input_mask = None
            if self.use_input_mask:
                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

            token_type_ids = None
            if self.use_token_type_ids:
                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

            mc_token_ids = None
            if self.use_mc_token_ids:
                mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length)

            sequence_labels = None
            token_labels = None
            choice_labels = None
            if self.use_labels:
                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = OpenAIGPTConfig(
                vocab_size=self.vocab_size,
                n_embd=self.hidden_size,
                n_layer=self.num_hidden_layers,
                n_head=self.num_attention_heads,
                # intermediate_size=self.intermediate_size,
                # hidden_act=self.hidden_act,
                # hidden_dropout_prob=self.hidden_dropout_prob,
                # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                n_positions=self.max_position_embeddings,
                n_ctx=self.max_position_embeddings
                # type_vocab_size=self.type_vocab_size,
                # initializer_range=self.initializer_range
            )

            head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)

            return (
                config,
                input_ids,
                input_mask,
                head_mask,
                token_type_ids,
                mc_token_ids,
                sequence_labels,
                token_labels,
                choice_labels,
            )

        def create_and_check_openai_gpt_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
            model = TFOpenAIGPTModel(config=config)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
            sequence_output = model(inputs)[0]

            inputs = [input_ids, input_mask]
            sequence_output = model(inputs)[0]

            sequence_output = model(input_ids)[0]

            result = {
                "sequence_output": sequence_output.numpy(),
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
            )

        def create_and_check_openai_gpt_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
            model = TFOpenAIGPTLMHeadModel(config=config)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
            prediction_scores = model(inputs)[0]
            result = {
                "prediction_scores": prediction_scores.numpy(),
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
            )

        def create_and_check_openai_gpt_double_head(
            self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args
        ):
            model = TFOpenAIGPTDoubleHeadsModel(config=config)

            multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
            multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
            multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))

            inputs = {
                "input_ids": multiple_choice_inputs_ids,
                "mc_token_ids": mc_token_ids,
                "attention_mask": multiple_choice_input_mask,
                "token_type_ids": multiple_choice_token_type_ids,
            }
            lm_logits, mc_logits = model(inputs)[:2]
            result = {"lm_logits": lm_logits.numpy(), "mc_logits": mc_logits.numpy()}
            self.parent.assertListEqual(
                list(result["lm_logits"].shape), [self.batch_size, self.num_choices, self.seq_length, self.vocab_size]
            )
            self.parent.assertListEqual(list(result["mc_logits"].shape), [self.batch_size, self.num_choices])

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()

            (
                config,
                input_ids,
                input_mask,
                head_mask,
                token_type_ids,
                mc_token_ids,
                sequence_labels,
                token_labels,
                choice_labels,
            ) = config_and_inputs

            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
            return config, inputs_dict

    def setUp(self):
-        self.model_tester = TFOpenAIGPTModelTest.TFOpenAIGPTModelTester(self)
+        self.model_tester = TFOpenAIGPTModelTester(self)
        self.config_tester = ConfigTester(self, config_class=OpenAIGPTConfig, n_embd=37)

    def test_config(self):

@@ -36,6 +36,139 @@ if is_tf_available():
    )


class TFRobertaModelTester:
    def __init__(
        self, parent,
    ):
        self.parent = parent
        self.batch_size = 13
        self.seq_length = 7
        self.is_training = True
        self.use_input_mask = True
        self.use_token_type_ids = True
        self.use_labels = True
        self.vocab_size = 99
        self.hidden_size = 32
        self.num_hidden_layers = 5
        self.num_attention_heads = 4
        self.intermediate_size = 37
        self.hidden_act = "gelu"
        self.hidden_dropout_prob = 0.1
        self.attention_probs_dropout_prob = 0.1
        self.max_position_embeddings = 512
        self.type_vocab_size = 16
        self.type_sequence_label_size = 2
        self.initializer_range = 0.02
        self.num_labels = 3
        self.num_choices = 4
        self.scope = None

    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        input_mask = None
        if self.use_input_mask:
            input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

        token_type_ids = None
        if self.use_token_type_ids:
            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

        sequence_labels = None
        token_labels = None
        choice_labels = None
        if self.use_labels:
            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)

        config = RobertaConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size,
            hidden_act=self.hidden_act,
            hidden_dropout_prob=self.hidden_dropout_prob,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            max_position_embeddings=self.max_position_embeddings,
            type_vocab_size=self.type_vocab_size,
            initializer_range=self.initializer_range,
        )

        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels

    def create_and_check_roberta_model(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        model = TFRobertaModel(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        sequence_output = model(inputs)[0]

        inputs = [input_ids, input_mask]
        sequence_output = model(inputs)[0]

        sequence_output = model(input_ids)[0]

        result = {
            "sequence_output": sequence_output.numpy(),
        }
        self.parent.assertListEqual(
            list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
        )

    def create_and_check_roberta_for_masked_lm(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        model = TFRobertaForMaskedLM(config=config)
        prediction_scores = model([input_ids, input_mask, token_type_ids])[0]
        result = {
            "prediction_scores": prediction_scores.numpy(),
        }
        self.parent.assertListEqual(
            list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
        )

    def create_and_check_roberta_for_token_classification(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        config.num_labels = self.num_labels
        model = TFRobertaForTokenClassification(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        (logits,) = model(inputs)
        result = {
            "logits": logits.numpy(),
        }
        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels])

    def create_and_check_roberta_for_question_answering(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        model = TFRobertaForQuestionAnswering(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
        start_logits, end_logits = model(inputs)
        result = {
            "start_logits": start_logits.numpy(),
            "end_logits": end_logits.numpy(),
        }
        self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
        self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])

    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        (
            config,
            input_ids,
            token_type_ids,
            input_mask,
            sequence_labels,
            token_labels,
            choice_labels,
        ) = config_and_inputs
        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
        return config, inputs_dict


@require_tf
class TFRobertaModelTest(TFModelTesterMixin, unittest.TestCase):

@@ -51,164 +184,8 @@ class TFRobertaModelTest(TFModelTesterMixin, unittest.TestCase):

        else ()
    )

||||||
class TFRobertaModelTester(object):
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
            parent,
            batch_size=13,
            seq_length=7,
            is_training=True,
            use_input_mask=True,
            use_token_type_ids=True,
            use_labels=True,
            vocab_size=99,
            hidden_size=32,
            num_hidden_layers=5,
            num_attention_heads=4,
            intermediate_size=37,
            hidden_act="gelu",
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            max_position_embeddings=512,
            type_vocab_size=16,
            type_sequence_label_size=2,
            initializer_range=0.02,
            num_labels=3,
            num_choices=4,
            scope=None,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_input_mask = use_input_mask
            self.use_token_type_ids = use_token_type_ids
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.intermediate_size = intermediate_size
            self.hidden_act = hidden_act
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.type_sequence_label_size = type_sequence_label_size
            self.initializer_range = initializer_range
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope

        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            input_mask = None
            if self.use_input_mask:
                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

            token_type_ids = None
            if self.use_token_type_ids:
                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

            sequence_labels = None
            token_labels = None
            choice_labels = None
            if self.use_labels:
                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = RobertaConfig(
                vocab_size=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,
                intermediate_size=self.intermediate_size,
                hidden_act=self.hidden_act,
                hidden_dropout_prob=self.hidden_dropout_prob,
                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                max_position_embeddings=self.max_position_embeddings,
                type_vocab_size=self.type_vocab_size,
                initializer_range=self.initializer_range,
            )

            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels

        def create_and_check_roberta_model(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = TFRobertaModel(config=config)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
            sequence_output = model(inputs)[0]
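
            # the same model can also be called with a positional list of inputs or a bare input_ids tensor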
            inputs = [input_ids, input_mask]
            sequence_output = model(inputs)[0]

            sequence_output = model(input_ids)[0]

            result = {
                "sequence_output": sequence_output.numpy(),
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
            )

        def create_and_check_roberta_for_masked_lm(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = TFRobertaForMaskedLM(config=config)
            prediction_scores = model([input_ids, input_mask, token_type_ids])[0]
            result = {
                "prediction_scores": prediction_scores.numpy(),
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
            )

        def create_and_check_roberta_for_token_classification(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            config.num_labels = self.num_labels
            model = TFRobertaForTokenClassification(config=config)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
            (logits,) = model(inputs)
            result = {
                "logits": logits.numpy(),
            }
            self.parent.assertListEqual(
                list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels]
            )

        def create_and_check_roberta_for_question_answering(
            self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
        ):
            model = TFRobertaForQuestionAnswering(config=config)
            inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
            start_logits, end_logits = model(inputs)
            result = {
                "start_logits": start_logits.numpy(),
                "end_logits": end_logits.numpy(),
            }
            self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
            self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (
                config,
                input_ids,
                token_type_ids,
                input_mask,
                sequence_labels,
                token_labels,
                choice_labels,
            ) = config_and_inputs
            inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
            return config, inputs_dict

    def setUp(self):
-        self.model_tester = TFRobertaModelTest.TFRobertaModelTester(self)
+        self.model_tester = TFRobertaModelTester(self)
        self.config_tester = ConfigTester(self, config_class=RobertaConfig, hidden_size=37)

    def test_config(self):

@@ -28,6 +28,186 @@ if is_tf_available():
    from transformers import TFT5Model, TFT5ForConditionalGeneration, T5Tokenizer


class TFT5ModelTester:
    def __init__(
        self, parent,
    ):
        self.parent = parent
        self.batch_size = 13
        self.seq_length = 7
        self.is_training = True
        self.use_input_mask = True
        self.use_labels = True
        self.vocab_size = 99
        self.n_positions = 14
        self.hidden_size = 32
        self.num_hidden_layers = 5
        self.num_attention_heads = 4
        self.d_ff = 37
        self.relative_attention_num_buckets = 8
        self.dropout_rate = 0.1
        self.initializer_factor = 0.002
        self.eos_token_id = 1
        self.pad_token_id = 0
        self.scope = None

    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        input_mask = None
        if self.use_input_mask:
            input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

        token_labels = None
        if self.use_labels:
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        config = T5Config(
            vocab_size=self.vocab_size,
            n_positions=self.n_positions,
            d_model=self.hidden_size,
            d_ff=self.d_ff,
            d_kv=self.hidden_size // self.num_attention_heads,
            num_layers=self.num_hidden_layers,
            num_heads=self.num_attention_heads,
            relative_attention_num_buckets=self.relative_attention_num_buckets,
            dropout_rate=self.dropout_rate,
            initializer_factor=self.initializer_factor,
            eos_token_id=self.eos_token_id,
            bos_token_id=self.pad_token_id,
            pad_token_id=self.pad_token_id,
        )

        return (config, input_ids, input_mask, token_labels)

    def create_and_check_t5_model(self, config, input_ids, input_mask, token_labels):
        model = TFT5Model(config=config)
        inputs = {
            "inputs": input_ids,
            "decoder_input_ids": input_ids,
            "decoder_attention_mask": input_mask,
        }
        decoder_output, decoder_past, encoder_output = model(inputs)
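
        # the same forward pass can also be run with keyword arguments instead of an input dict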
        decoder_output, decoder_past, encoder_output = model(
            input_ids, decoder_attention_mask=input_mask, decoder_input_ids=input_ids
        )
        result = {
            "encoder_output": encoder_output.numpy(),
            "decoder_past": decoder_past,
            "decoder_output": decoder_output.numpy(),
        }
        self.parent.assertListEqual(
            list(result["encoder_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
        )
        self.parent.assertListEqual(
            list(result["decoder_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
        )
        self.parent.assertEqual(len(decoder_past), 2)
        # decoder_past[0] should correspond to encoder output
        self.parent.assertTrue(tf.reduce_all(tf.math.equal(decoder_past[0][0], encoder_output)))
        # There should be `num_layers` key value embeddings stored in decoder_past[1]
        self.parent.assertEqual(len(decoder_past[1]), config.num_layers)
        # There should be a self attn key, a self attn value, a cross attn key and a cross attn value stored in each decoder_past[1] tuple
        self.parent.assertEqual(len(decoder_past[1][0]), 4)

    def create_and_check_t5_with_lm_head(self, config, input_ids, input_mask, token_labels):
        model = TFT5ForConditionalGeneration(config=config)
        inputs_dict = {
            "inputs": input_ids,
            "decoder_input_ids": input_ids,
            "decoder_attention_mask": input_mask,
        }

        prediction_scores, _, _ = model(inputs_dict)

        result = {
            "prediction_scores": prediction_scores.numpy(),
        }
        self.parent.assertListEqual(
            list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
        )

    def create_and_check_t5_decoder_model_past(self, config, input_ids, decoder_input_ids, attention_mask):
        model = TFT5Model(config=config).get_decoder()

        input_ids = input_ids[:1, :]
        self.batch_size = 1

        # first forward pass
        _, past_key_value_states = model(input_ids, use_cache=True)

        # create hypothetical next token and extend to next_input_ids
        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)

        # append to next input_ids
        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)

        output_from_no_past = model(next_input_ids)[0]
        output_from_past = model(next_tokens, past_key_value_states=past_key_value_states)[0]
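
        # the cached pass sees only the new token, so its single output position (index 0)
        # should match the last position (index -1) of the full-sequence pass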

        # select random slice
        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
        output_from_past_slice = output_from_past[:, 0, random_slice_idx]

        # test that outputs are equal for slice
        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)

    def create_and_check_t5_decoder_model_attention_mask_past(
        self, config, input_ids, decoder_input_ids, attention_mask
    ):
        model = TFT5Model(config=config).get_decoder()

        # create attention mask
        half_seq_length = self.seq_length // 2
        attn_mask_begin = tf.ones((self.batch_size, half_seq_length), dtype=tf.int32)
        attn_mask_end = tf.zeros((self.batch_size, self.seq_length - half_seq_length), dtype=tf.int32)
        attn_mask = tf.concat([attn_mask_begin, attn_mask_end], axis=1)

        # first forward pass
        _, past_key_value_states = model(input_ids, attention_mask=attn_mask, use_cache=True)

        # create hypothetical next token and extend to next_input_ids
        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)

        # change a random masked slice from input_ids
        random_seq_idx_to_change = ids_tensor((1,), half_seq_length).numpy() + 1
        random_other_next_tokens = ids_tensor((self.batch_size, self.seq_length), config.vocab_size)
        vector_condition = tf.range(self.seq_length) == (self.seq_length - random_seq_idx_to_change)
        condition = tf.transpose(
            tf.broadcast_to(tf.expand_dims(vector_condition, -1), (self.seq_length, self.batch_size))
        )
        input_ids = tf.where(condition, random_other_next_tokens, input_ids)
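
        # the changed tokens sit in the attention-masked second half of the sequence,
        # so the cached and uncached passes should still agree on the appended token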

        # append to next input_ids and attn_mask
        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
        attn_mask = tf.concat([attn_mask, tf.ones((attn_mask.shape[0], 1), dtype=tf.int32)], axis=1,)

        # get two different outputs
        output_from_no_past = model(next_input_ids, attention_mask=attn_mask)[0]
        output_from_past = model(next_tokens, past_key_value_states=past_key_value_states, attention_mask=attn_mask)[0]

        # select random slice
        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).numpy().item()
        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
        output_from_past_slice = output_from_past[:, 0, random_slice_idx]

        # test that outputs are equal for slice
        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)

    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        (config, input_ids, input_mask, token_labels) = config_and_inputs
        inputs_dict = {
            "inputs": input_ids,
            "decoder_input_ids": input_ids,
            "decoder_attention_mask": input_mask,
            "use_cache": tf.convert_to_tensor([False]),
        }
        return config, inputs_dict


@require_tf
class TFT5ModelTest(TFModelTesterMixin, unittest.TestCase):

@@ -35,207 +215,8 @@ class TFT5ModelTest(TFModelTesterMixin, unittest.TestCase):
    all_model_classes = (TFT5Model, TFT5ForConditionalGeneration) if is_tf_available() else ()
    all_generative_model_classes = (TFT5ForConditionalGeneration,) if is_tf_available() else ()
    class TFT5ModelTester(object):
        def __init__(
            self,
            parent,
            batch_size=13,
            seq_length=7,
            is_training=True,
            use_input_mask=True,
            use_labels=True,
            vocab_size=99,
            n_positions=14,
            hidden_size=32,
            num_hidden_layers=5,
            num_attention_heads=4,
            d_ff=37,
            relative_attention_num_buckets=8,
            dropout_rate=0.1,
            initializer_factor=0.002,
            eos_token_id=1,
            pad_token_id=0,
            scope=None,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_input_mask = use_input_mask
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.n_positions = n_positions
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.d_ff = d_ff
            self.relative_attention_num_buckets = relative_attention_num_buckets
            self.dropout_rate = dropout_rate
            self.initializer_factor = initializer_factor
            self.eos_token_id = eos_token_id
            self.pad_token_id = pad_token_id
            self.scope = scope

        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            input_mask = None
            if self.use_input_mask:
                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

            token_labels = None
            if self.use_labels:
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            config = T5Config(
                vocab_size=self.vocab_size,
                n_positions=self.n_positions,
                d_model=self.hidden_size,
                d_ff=self.d_ff,
                d_kv=self.hidden_size // self.num_attention_heads,
                num_layers=self.num_hidden_layers,
                num_heads=self.num_attention_heads,
                relative_attention_num_buckets=self.relative_attention_num_buckets,
                dropout_rate=self.dropout_rate,
                initializer_factor=self.initializer_factor,
                eos_token_id=self.eos_token_id,
                bos_token_id=self.pad_token_id,
                pad_token_id=self.pad_token_id,
            )

            return (config, input_ids, input_mask, token_labels)

        def create_and_check_t5_model(self, config, input_ids, input_mask, token_labels):
            model = TFT5Model(config=config)
            inputs = {
                "inputs": input_ids,
                "decoder_input_ids": input_ids,
                "decoder_attention_mask": input_mask,
            }
            decoder_output, decoder_past, encoder_output = model(inputs)

            decoder_output, decoder_past, encoder_output = model(
                input_ids, decoder_attention_mask=input_mask, decoder_input_ids=input_ids
            )
            result = {
                "encoder_output": encoder_output.numpy(),
                "decoder_past": decoder_past,
                "decoder_output": decoder_output.numpy(),
            }
            self.parent.assertListEqual(
                list(result["encoder_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
            )
            self.parent.assertListEqual(
                list(result["decoder_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
            )
            self.parent.assertEqual(len(decoder_past), 2)
            # decoder_past[0] should correspond to encoder output
            self.parent.assertTrue(tf.reduce_all(tf.math.equal(decoder_past[0][0], encoder_output)))
            # There should be `num_layers` key value embeddings stored in decoder_past[1]
            self.parent.assertEqual(len(decoder_past[1]), config.num_layers)
            # There should be a self attn key, a self attn value, a cross attn key and a cross attn value stored in each decoder_past[1] tuple
            self.parent.assertEqual(len(decoder_past[1][0]), 4)

        def create_and_check_t5_with_lm_head(self, config, input_ids, input_mask, token_labels):
            model = TFT5ForConditionalGeneration(config=config)
            inputs_dict = {
                "inputs": input_ids,
                "decoder_input_ids": input_ids,
                "decoder_attention_mask": input_mask,
            }

            prediction_scores, _, _ = model(inputs_dict)

            result = {
                "prediction_scores": prediction_scores.numpy(),
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
            )

        def create_and_check_t5_decoder_model_past(self, config, input_ids, decoder_input_ids, attention_mask):
            model = TFT5Model(config=config).get_decoder()

            input_ids = input_ids[:1, :]
            self.batch_size = 1

            # first forward pass
            _, past_key_value_states = model(input_ids, use_cache=True)

            # create hypothetical next token and extend to next_input_ids
            next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)

            # append to next input_ids
            next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)

            output_from_no_past = model(next_input_ids)[0]
            output_from_past = model(next_tokens, past_key_value_states=past_key_value_states)[0]

            # select random slice
            random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
            output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
            output_from_past_slice = output_from_past[:, 0, random_slice_idx]

            # test that outputs are equal for slice
            tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)

        def create_and_check_t5_decoder_model_attention_mask_past(
            self, config, input_ids, decoder_input_ids, attention_mask
        ):
            model = TFT5Model(config=config).get_decoder()

            # create attention mask
            half_seq_length = self.seq_length // 2
            attn_mask_begin = tf.ones((self.batch_size, half_seq_length), dtype=tf.int32)
            attn_mask_end = tf.zeros((self.batch_size, self.seq_length - half_seq_length), dtype=tf.int32)
            attn_mask = tf.concat([attn_mask_begin, attn_mask_end], axis=1)

            # first forward pass
            _, past_key_value_states = model(input_ids, attention_mask=attn_mask, use_cache=True)

            # create hypothetical next token and extend to next_input_ids
            next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)

            # change a random masked slice from input_ids
            random_seq_idx_to_change = ids_tensor((1,), half_seq_length).numpy() + 1
            random_other_next_tokens = ids_tensor((self.batch_size, self.seq_length), config.vocab_size)
            vector_condition = tf.range(self.seq_length) == (self.seq_length - random_seq_idx_to_change)
            condition = tf.transpose(
                tf.broadcast_to(tf.expand_dims(vector_condition, -1), (self.seq_length, self.batch_size))
            )
            input_ids = tf.where(condition, random_other_next_tokens, input_ids)

            # append to next input_ids and attn_mask
            next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
            attn_mask = tf.concat([attn_mask, tf.ones((attn_mask.shape[0], 1), dtype=tf.int32)], axis=1,)

            # get two different outputs
            output_from_no_past = model(next_input_ids, attention_mask=attn_mask)[0]
            output_from_past = model(
                next_tokens, past_key_value_states=past_key_value_states, attention_mask=attn_mask
            )[0]

            # select random slice
            random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).numpy().item()
            output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
            output_from_past_slice = output_from_past[:, 0, random_slice_idx]

            # test that outputs are equal for slice
            tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids, input_mask, token_labels) = config_and_inputs
            inputs_dict = {
                "inputs": input_ids,
                "decoder_input_ids": input_ids,
                "decoder_attention_mask": input_mask,
                "use_cache": tf.convert_to_tensor([False]),
            }
            return config, inputs_dict

    def setUp(self):
-        self.model_tester = TFT5ModelTest.TFT5ModelTester(self)
+        self.model_tester = TFT5ModelTester(self)
        self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37)

    def test_config(self):

@@ -33,6 +33,135 @@ if is_tf_available():
    )


class TFTransfoXLModelTester:
    def __init__(
        self, parent,
    ):
        self.parent = parent
        self.batch_size = 13
        self.seq_length = 7
        self.mem_len = 30
        self.key_length = self.seq_length + self.mem_len
        self.clamp_len = 15
        self.is_training = True
        self.use_labels = True
        self.vocab_size = 99
        self.cutoffs = [10, 50, 80]
        self.hidden_size = 32
        self.d_embed = 32
        self.num_attention_heads = 4
        self.d_head = 8
        self.d_inner = 128
        self.div_val = 2
        self.num_hidden_layers = 5
        self.scope = None
        self.seed = 1
        self.eos_token_id = 0

    def prepare_config_and_inputs(self):
        input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
        input_ids_2 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        lm_labels = None
        if self.use_labels:
            lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        config = TransfoXLConfig(
            vocab_size=self.vocab_size,
            mem_len=self.mem_len,
            clamp_len=self.clamp_len,
            cutoffs=self.cutoffs,
            d_model=self.hidden_size,
            d_embed=self.d_embed,
            n_head=self.num_attention_heads,
            d_head=self.d_head,
            d_inner=self.d_inner,
            div_val=self.div_val,
            n_layer=self.num_hidden_layers,
            eos_token_id=self.eos_token_id,
        )

        return (config, input_ids_1, input_ids_2, lm_labels)

    def set_seed(self):
        random.seed(self.seed)
        tf.random.set_seed(self.seed)

    def create_and_check_transfo_xl_model(self, config, input_ids_1, input_ids_2, lm_labels):
        model = TFTransfoXLModel(config)

        hidden_states_1, mems_1 = model(input_ids_1)
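
        # feed the mems returned for the first segment back in as memory for the second segment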
        inputs = {"input_ids": input_ids_2, "mems": mems_1}

        hidden_states_2, mems_2 = model(inputs)

        result = {
            "hidden_states_1": hidden_states_1.numpy(),
            "mems_1": [mem.numpy() for mem in mems_1],
            "hidden_states_2": hidden_states_2.numpy(),
            "mems_2": [mem.numpy() for mem in mems_2],
        }

        self.parent.assertListEqual(
            list(result["hidden_states_1"].shape), [self.batch_size, self.seq_length, self.hidden_size]
        )
        self.parent.assertListEqual(
            list(result["hidden_states_2"].shape), [self.batch_size, self.seq_length, self.hidden_size]
        )
        self.parent.assertListEqual(
            list(list(mem.shape) for mem in result["mems_1"]),
            [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
        )
        self.parent.assertListEqual(
            list(list(mem.shape) for mem in result["mems_2"]),
            [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
        )

    def create_and_check_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels):
        model = TFTransfoXLLMHeadModel(config)

        lm_logits_1, mems_1 = model(input_ids_1)

        inputs = {"input_ids": input_ids_1, "labels": lm_labels}
        _, mems_1 = model(inputs)

        lm_logits_2, mems_2 = model([input_ids_2, mems_1])

        inputs = {"input_ids": input_ids_1, "mems": mems_1, "labels": lm_labels}

        _, mems_2 = model(inputs)

        result = {
            "mems_1": [mem.numpy() for mem in mems_1],
            "lm_logits_1": lm_logits_1.numpy(),
            "mems_2": [mem.numpy() for mem in mems_2],
            "lm_logits_2": lm_logits_2.numpy(),
        }

        self.parent.assertListEqual(
            list(result["lm_logits_1"].shape), [self.batch_size, self.seq_length, self.vocab_size]
        )
        self.parent.assertListEqual(
            list(list(mem.shape) for mem in result["mems_1"]),
            [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
        )

        self.parent.assertListEqual(
            list(result["lm_logits_2"].shape), [self.batch_size, self.seq_length, self.vocab_size]
        )
        self.parent.assertListEqual(
            list(list(mem.shape) for mem in result["mems_2"]),
            [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
        )

    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        (config, input_ids_1, input_ids_2, lm_labels) = config_and_inputs
        inputs_dict = {"input_ids": input_ids_1}
        return config, inputs_dict


@require_tf
class TFTransfoXLModelTest(TFModelTesterMixin, unittest.TestCase):

@@ -43,155 +172,8 @@ class TFTransfoXLModelTest(TFModelTesterMixin, unittest.TestCase):
    test_torchscript = False
    test_resize_embeddings = False
    class TFTransfoXLModelTester(object):
        def __init__(
            self,
            parent,
            batch_size=13,
            seq_length=7,
            mem_len=30,
            clamp_len=15,
            is_training=True,
            use_labels=True,
            vocab_size=99,
            cutoffs=[10, 50, 80],
            hidden_size=32,
            d_embed=32,
            num_attention_heads=4,
            d_head=8,
            d_inner=128,
            div_val=2,
            num_hidden_layers=5,
            scope=None,
            seed=1,
            eos_token_id=0,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.mem_len = mem_len
            self.key_length = seq_length + mem_len
            self.clamp_len = clamp_len
            self.is_training = is_training
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.cutoffs = cutoffs
            self.hidden_size = hidden_size
            self.d_embed = d_embed
            self.num_attention_heads = num_attention_heads
            self.d_head = d_head
            self.d_inner = d_inner
            self.div_val = div_val
            self.num_hidden_layers = num_hidden_layers
            self.scope = scope
            self.seed = seed
            self.eos_token_id = eos_token_id

        def prepare_config_and_inputs(self):
            input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
            input_ids_2 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            lm_labels = None
            if self.use_labels:
                lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            config = TransfoXLConfig(
                vocab_size=self.vocab_size,
                mem_len=self.mem_len,
                clamp_len=self.clamp_len,
                cutoffs=self.cutoffs,
                d_model=self.hidden_size,
                d_embed=self.d_embed,
                n_head=self.num_attention_heads,
                d_head=self.d_head,
                d_inner=self.d_inner,
                div_val=self.div_val,
                n_layer=self.num_hidden_layers,
                eos_token_id=self.eos_token_id,
            )

            return (config, input_ids_1, input_ids_2, lm_labels)

        def set_seed(self):
            random.seed(self.seed)
            tf.random.set_seed(self.seed)

        def create_and_check_transfo_xl_model(self, config, input_ids_1, input_ids_2, lm_labels):
            model = TFTransfoXLModel(config)

            hidden_states_1, mems_1 = model(input_ids_1)

            inputs = {"input_ids": input_ids_2, "mems": mems_1}

            hidden_states_2, mems_2 = model(inputs)

            result = {
                "hidden_states_1": hidden_states_1.numpy(),
                "mems_1": [mem.numpy() for mem in mems_1],
                "hidden_states_2": hidden_states_2.numpy(),
                "mems_2": [mem.numpy() for mem in mems_2],
            }

            self.parent.assertListEqual(
                list(result["hidden_states_1"].shape), [self.batch_size, self.seq_length, self.hidden_size]
            )
            self.parent.assertListEqual(
                list(result["hidden_states_2"].shape), [self.batch_size, self.seq_length, self.hidden_size]
            )
            self.parent.assertListEqual(
                list(list(mem.shape) for mem in result["mems_1"]),
                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
            )
            self.parent.assertListEqual(
                list(list(mem.shape) for mem in result["mems_2"]),
                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
            )

        def create_and_check_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels):
            model = TFTransfoXLLMHeadModel(config)

            lm_logits_1, mems_1 = model(input_ids_1)

            inputs = {"input_ids": input_ids_1, "labels": lm_labels}
            _, mems_1 = model(inputs)

            lm_logits_2, mems_2 = model([input_ids_2, mems_1])

            inputs = {"input_ids": input_ids_1, "mems": mems_1, "labels": lm_labels}

            _, mems_2 = model(inputs)

            result = {
                "mems_1": [mem.numpy() for mem in mems_1],
                "lm_logits_1": lm_logits_1.numpy(),
                "mems_2": [mem.numpy() for mem in mems_2],
                "lm_logits_2": lm_logits_2.numpy(),
            }

            self.parent.assertListEqual(
                list(result["lm_logits_1"].shape), [self.batch_size, self.seq_length, self.vocab_size]
            )
            self.parent.assertListEqual(
                list(list(mem.shape) for mem in result["mems_1"]),
                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
            )

            self.parent.assertListEqual(
                list(result["lm_logits_2"].shape), [self.batch_size, self.seq_length, self.vocab_size]
            )
            self.parent.assertListEqual(
                list(list(mem.shape) for mem in result["mems_2"]),
                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
            )

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids_1, input_ids_2, lm_labels) = config_and_inputs
            inputs_dict = {"input_ids": input_ids_1}
            return config, inputs_dict

    def setUp(self):
-        self.model_tester = TFTransfoXLModelTest.TFTransfoXLModelTester(self)
+        self.model_tester = TFTransfoXLModelTester(self)
        self.config_tester = ConfigTester(self, config_class=TransfoXLConfig, d_embed=37)

    def test_config(self):

@@ -35,6 +35,211 @@ if is_tf_available():
    )


class TFXLMModelTester:
    def __init__(
        self, parent,
    ):
        self.parent = parent
        self.batch_size = 13
        self.seq_length = 7
        self.is_training = True
        self.use_input_lengths = True
        self.use_token_type_ids = True
        self.use_labels = True
        self.gelu_activation = True
        self.sinusoidal_embeddings = False
        self.causal = False
        self.asm = False
        self.n_langs = 2
        self.vocab_size = 99
        self.n_special = 0
        self.hidden_size = 32
        self.num_hidden_layers = 5
        self.num_attention_heads = 4
        self.hidden_dropout_prob = 0.1
        self.attention_probs_dropout_prob = 0.1
        self.max_position_embeddings = 512
        self.type_vocab_size = 16
        self.type_sequence_label_size = 2
        self.initializer_range = 0.02
        self.num_labels = 3
        self.num_choices = 4
        self.summary_type = "last"
        self.use_proj = True
        self.scope = None
        self.bos_token_id = 0

    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
        input_mask = ids_tensor([self.batch_size, self.seq_length], 2, dtype=tf.float32)

        input_lengths = None
        if self.use_input_lengths:
            input_lengths = (
                ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2
            )  # small variation of seq_length

        token_type_ids = None
        if self.use_token_type_ids:
            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.n_langs)

        sequence_labels = None
        token_labels = None
        is_impossible_labels = None
        if self.use_labels:
            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)

        config = XLMConfig(
            vocab_size=self.vocab_size,
            n_special=self.n_special,
            emb_dim=self.hidden_size,
            n_layers=self.num_hidden_layers,
            n_heads=self.num_attention_heads,
            dropout=self.hidden_dropout_prob,
            attention_dropout=self.attention_probs_dropout_prob,
            gelu_activation=self.gelu_activation,
            sinusoidal_embeddings=self.sinusoidal_embeddings,
            asm=self.asm,
            causal=self.causal,
            n_langs=self.n_langs,
            max_position_embeddings=self.max_position_embeddings,
            initializer_range=self.initializer_range,
            summary_type=self.summary_type,
            use_proj=self.use_proj,
            bos_token_id=self.bos_token_id,
        )

        return (
            config,
            input_ids,
            token_type_ids,
            input_lengths,
            sequence_labels,
            token_labels,
            is_impossible_labels,
            input_mask,
        )

    def create_and_check_xlm_model(
        self,
        config,
        input_ids,
        token_type_ids,
        input_lengths,
        sequence_labels,
        token_labels,
        is_impossible_labels,
        input_mask,
    ):
        model = TFXLMModel(config=config)
        inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids}
        outputs = model(inputs)
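
        # the model also accepts its inputs as a positional list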
        inputs = [input_ids, input_mask]
        outputs = model(inputs)
        sequence_output = outputs[0]
        result = {
            "sequence_output": sequence_output.numpy(),
        }
        self.parent.assertListEqual(
            list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
        )

    def create_and_check_xlm_lm_head(
        self,
        config,
        input_ids,
        token_type_ids,
        input_lengths,
        sequence_labels,
        token_labels,
        is_impossible_labels,
        input_mask,
    ):
        model = TFXLMWithLMHeadModel(config)

        inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids}
        outputs = model(inputs)

        logits = outputs[0]

        result = {
            "logits": logits.numpy(),
        }

        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size])

    def create_and_check_xlm_qa(
        self,
        config,
        input_ids,
        token_type_ids,
        input_lengths,
        sequence_labels,
        token_labels,
        is_impossible_labels,
        input_mask,
    ):
        model = TFXLMForQuestionAnsweringSimple(config)

        inputs = {"input_ids": input_ids, "lengths": input_lengths}

        start_logits, end_logits = model(inputs)

        result = {
            "start_logits": start_logits.numpy(),
            "end_logits": end_logits.numpy(),
        }

        self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
        self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])

    def create_and_check_xlm_sequence_classif(
        self,
        config,
        input_ids,
        token_type_ids,
        input_lengths,
        sequence_labels,
        token_labels,
        is_impossible_labels,
        input_mask,
    ):
        model = TFXLMForSequenceClassification(config)

        inputs = {"input_ids": input_ids, "lengths": input_lengths}

        (logits,) = model(inputs)

        result = {
            "logits": logits.numpy(),
        }

        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.type_sequence_label_size])

    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        (
            config,
            input_ids,
            token_type_ids,
            input_lengths,
            sequence_labels,
            token_labels,
            is_impossible_labels,
            input_mask,
        ) = config_and_inputs
        inputs_dict = {
            "input_ids": input_ids,
            "token_type_ids": token_type_ids,
            "langs": token_type_ids,
            "lengths": input_lengths,
        }
        return config, inputs_dict


@require_tf
class TFXLMModelTest(TFModelTesterMixin, unittest.TestCase):

@@ -47,244 +252,8 @@ class TFXLMModelTest(TFModelTesterMixin, unittest.TestCase):
        (TFXLMWithLMHeadModel,) if is_tf_available() else ()
    )  # TODO (PVP): Check other models whether language generation is also applicable
    class TFXLMModelTester(object):
        def __init__(
            self,
            parent,
            batch_size=13,
            seq_length=7,
            is_training=True,
            use_input_lengths=True,
            use_token_type_ids=True,
            use_labels=True,
            gelu_activation=True,
            sinusoidal_embeddings=False,
            causal=False,
            asm=False,
            n_langs=2,
            vocab_size=99,
            n_special=0,
            hidden_size=32,
            num_hidden_layers=5,
            num_attention_heads=4,
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            max_position_embeddings=512,
            type_vocab_size=16,
            type_sequence_label_size=2,
            initializer_range=0.02,
            num_labels=3,
            num_choices=4,
            summary_type="last",
            use_proj=True,
            scope=None,
            bos_token_id=0,
        ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_input_lengths = use_input_lengths
            self.use_token_type_ids = use_token_type_ids
            self.use_labels = use_labels
            self.gelu_activation = gelu_activation
            self.sinusoidal_embeddings = sinusoidal_embeddings
            self.asm = asm
            self.n_langs = n_langs
            self.vocab_size = vocab_size
            self.n_special = n_special
            self.summary_type = summary_type
            self.causal = causal
            self.use_proj = use_proj
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.n_langs = n_langs
            self.type_sequence_label_size = type_sequence_label_size
            self.initializer_range = initializer_range
            self.summary_type = summary_type
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope
            self.bos_token_id = bos_token_id

        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
            input_mask = ids_tensor([self.batch_size, self.seq_length], 2, dtype=tf.float32)

            input_lengths = None
            if self.use_input_lengths:
                input_lengths = (
                    ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2
                )  # small variation of seq_length

            token_type_ids = None
            if self.use_token_type_ids:
                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.n_langs)

            sequence_labels = None
            token_labels = None
            is_impossible_labels = None
            if self.use_labels:
                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
                is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)

            config = XLMConfig(
                vocab_size=self.vocab_size,
                n_special=self.n_special,
                emb_dim=self.hidden_size,
                n_layers=self.num_hidden_layers,
                n_heads=self.num_attention_heads,
                dropout=self.hidden_dropout_prob,
                attention_dropout=self.attention_probs_dropout_prob,
                gelu_activation=self.gelu_activation,
                sinusoidal_embeddings=self.sinusoidal_embeddings,
                asm=self.asm,
                causal=self.causal,
                n_langs=self.n_langs,
                max_position_embeddings=self.max_position_embeddings,
                initializer_range=self.initializer_range,
                summary_type=self.summary_type,
                use_proj=self.use_proj,
                bos_token_id=self.bos_token_id,
            )

            return (
                config,
                input_ids,
                token_type_ids,
                input_lengths,
                sequence_labels,
                token_labels,
                is_impossible_labels,
                input_mask,
            )

        def create_and_check_xlm_model(
            self,
            config,
            input_ids,
            token_type_ids,
            input_lengths,
            sequence_labels,
            token_labels,
            is_impossible_labels,
            input_mask,
        ):
            model = TFXLMModel(config=config)
            inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids}
            outputs = model(inputs)

            inputs = [input_ids, input_mask]
            outputs = model(inputs)
            sequence_output = outputs[0]
            result = {
                "sequence_output": sequence_output.numpy(),
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
            )

        def create_and_check_xlm_lm_head(
            self,
            config,
            input_ids,
            token_type_ids,
            input_lengths,
            sequence_labels,
            token_labels,
            is_impossible_labels,
            input_mask,
        ):
            model = TFXLMWithLMHeadModel(config)

            inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids}
            outputs = model(inputs)

            logits = outputs[0]

            result = {
                "logits": logits.numpy(),
            }

            self.parent.assertListEqual(
                list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size]
            )

        def create_and_check_xlm_qa(
            self,
            config,
            input_ids,
            token_type_ids,
            input_lengths,
            sequence_labels,
            token_labels,
            is_impossible_labels,
            input_mask,
        ):
            model = TFXLMForQuestionAnsweringSimple(config)

            inputs = {"input_ids": input_ids, "lengths": input_lengths}

            start_logits, end_logits = model(inputs)

            result = {
                "start_logits": start_logits.numpy(),
                "end_logits": end_logits.numpy(),
            }

            self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
            self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])

        def create_and_check_xlm_sequence_classif(
            self,
            config,
            input_ids,
            token_type_ids,
            input_lengths,
            sequence_labels,
            token_labels,
            is_impossible_labels,
            input_mask,
        ):
            model = TFXLMForSequenceClassification(config)

            inputs = {"input_ids": input_ids, "lengths": input_lengths}

            (logits,) = model(inputs)

            result = {
                "logits": logits.numpy(),
            }

            self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.type_sequence_label_size])

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (
                config,
                input_ids,
                token_type_ids,
                input_lengths,
                sequence_labels,
                token_labels,
                is_impossible_labels,
                input_mask,
            ) = config_and_inputs
            inputs_dict = {
                "input_ids": input_ids,
                "token_type_ids": token_type_ids,
                "langs": token_type_ids,
                "lengths": input_lengths,
            }
            return config, inputs_dict

    def setUp(self):
-        self.model_tester = TFXLMModelTest.TFXLMModelTester(self)
+        self.model_tester = TFXLMModelTester(self)
        self.config_tester = ConfigTester(self, config_class=XLMConfig, emb_dim=37)

    def test_config(self):

@@ -37,6 +37,304 @@ if is_tf_available():
    )


class TFXLNetModelTester:
    def __init__(
        self, parent,
    ):
        self.parent = parent
        self.batch_size = 13
        self.seq_length = 7
        self.mem_len = 10
        # self.key_len = seq_length + mem_len
        self.clamp_len = -1
        self.reuse_len = 15
        self.is_training = True
        self.use_labels = True
        self.vocab_size = 99
        self.cutoffs = [10, 50, 80]
        self.hidden_size = 32
        self.num_attention_heads = 4
        self.d_inner = 128
        self.num_hidden_layers = 5
        self.type_sequence_label_size = 2
        self.untie_r = True
        self.bi_data = False
        self.same_length = False
        self.initializer_range = 0.05
        self.seed = 1
        self.type_vocab_size = 2
        self.bos_token_id = 1
        self.eos_token_id = 2
        self.pad_token_id = 5

    def prepare_config_and_inputs(self):
        input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
        input_ids_2 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
        segment_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
        input_mask = ids_tensor([self.batch_size, self.seq_length], 2, dtype=tf.float32)

        input_ids_q = ids_tensor([self.batch_size, self.seq_length + 1], self.vocab_size)
        perm_mask = tf.zeros((self.batch_size, self.seq_length + 1, self.seq_length), dtype=tf.float32)
        perm_mask_last = tf.ones((self.batch_size, self.seq_length + 1, 1), dtype=tf.float32)
        perm_mask = tf.concat([perm_mask, perm_mask_last], axis=-1)
        # perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
        target_mapping = tf.zeros((self.batch_size, 1, self.seq_length), dtype=tf.float32)
        target_mapping_last = tf.ones((self.batch_size, 1, 1), dtype=tf.float32)
        target_mapping = tf.concat([target_mapping, target_mapping_last], axis=-1)
        # target_mapping[:, 0, -1] = 1.0  # predict last token

        sequence_labels = None
        lm_labels = None
        is_impossible_labels = None
        if self.use_labels:
            lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
            is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)

        config = XLNetConfig(
            vocab_size=self.vocab_size,
            d_model=self.hidden_size,
            n_head=self.num_attention_heads,
            d_inner=self.d_inner,
            n_layer=self.num_hidden_layers,
            untie_r=self.untie_r,
            mem_len=self.mem_len,
            clamp_len=self.clamp_len,
            same_length=self.same_length,
            reuse_len=self.reuse_len,
            bi_data=self.bi_data,
            initializer_range=self.initializer_range,
            num_labels=self.type_sequence_label_size,
            bos_token_id=self.bos_token_id,
            pad_token_id=self.pad_token_id,
            eos_token_id=self.eos_token_id,
        )

        return (
            config,
            input_ids_1,
            input_ids_2,
            input_ids_q,
            perm_mask,
            input_mask,
            target_mapping,
            segment_ids,
            lm_labels,
            sequence_labels,
            is_impossible_labels,
        )

    def set_seed(self):
        random.seed(self.seed)
        tf.random.set_seed(self.seed)

    def create_and_check_xlnet_base_model(
        self,
        config,
        input_ids_1,
        input_ids_2,
        input_ids_q,
        perm_mask,
        input_mask,
        target_mapping,
        segment_ids,
        lm_labels,
        sequence_labels,
        is_impossible_labels,
    ):
        model = TFXLNetModel(config)

        inputs = {"input_ids": input_ids_1, "input_mask": input_mask, "token_type_ids": segment_ids}

        _, _ = model(inputs)

        inputs = [input_ids_1, input_mask]

        outputs, mems_1 = model(inputs)

        result = {
            "mems_1": [mem.numpy() for mem in mems_1],
            "outputs": outputs.numpy(),
        }
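
        # with mem_len set to 0 the model should return only the hidden states, without mems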
        config.mem_len = 0
        model = TFXLNetModel(config)
        no_mems_outputs = model(inputs)
        self.parent.assertEqual(len(no_mems_outputs), 1)

        self.parent.assertListEqual(
            list(result["outputs"].shape), [self.batch_size, self.seq_length, self.hidden_size]
        )
        self.parent.assertListEqual(
            list(list(mem.shape) for mem in result["mems_1"]),
            [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
        )

    def create_and_check_xlnet_lm_head(
        self,
        config,
        input_ids_1,
        input_ids_2,
        input_ids_q,
        perm_mask,
        input_mask,
        target_mapping,
        segment_ids,
        lm_labels,
        sequence_labels,
        is_impossible_labels,
    ):
        model = TFXLNetLMHeadModel(config)

        inputs_1 = {"input_ids": input_ids_1, "token_type_ids": segment_ids}

        all_logits_1, mems_1 = model(inputs_1)

        inputs_2 = {"input_ids": input_ids_2, "mems": mems_1, "token_type_ids": segment_ids}

        all_logits_2, mems_2 = model(inputs_2)

        inputs_3 = {"input_ids": input_ids_q, "perm_mask": perm_mask, "target_mapping": target_mapping}
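
        # perm_mask and target_mapping were built in prepare_config_and_inputs so that only the last token is predicted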
        logits, _ = model(inputs_3)

        result = {
            "mems_1": [mem.numpy() for mem in mems_1],
            "all_logits_1": all_logits_1.numpy(),
            "mems_2": [mem.numpy() for mem in mems_2],
            "all_logits_2": all_logits_2.numpy(),
        }

        self.parent.assertListEqual(
            list(result["all_logits_1"].shape), [self.batch_size, self.seq_length, self.vocab_size]
        )
        self.parent.assertListEqual(
            list(list(mem.shape) for mem in result["mems_1"]),
            [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
        )

        self.parent.assertListEqual(
            list(result["all_logits_2"].shape), [self.batch_size, self.seq_length, self.vocab_size]
        )
        self.parent.assertListEqual(
            list(list(mem.shape) for mem in result["mems_2"]),
            [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
        )

    def create_and_check_xlnet_qa(
        self,
        config,
        input_ids_1,
        input_ids_2,
        input_ids_q,
        perm_mask,
        input_mask,
        target_mapping,
        segment_ids,
        lm_labels,
        sequence_labels,
        is_impossible_labels,
    ):
        model = TFXLNetForQuestionAnsweringSimple(config)

        inputs = {"input_ids": input_ids_1, "attention_mask": input_mask, "token_type_ids": segment_ids}
        start_logits, end_logits, mems = model(inputs)

        result = {
            "start_logits": start_logits.numpy(),
            "end_logits": end_logits.numpy(),
            "mems": [m.numpy() for m in mems],
        }

        self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
        self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])
        self.parent.assertListEqual(
            list(list(mem.shape) for mem in result["mems"]),
            [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
        )

    def create_and_check_xlnet_sequence_classif(
        self,
        config,
        input_ids_1,
        input_ids_2,
        input_ids_q,
        perm_mask,
        input_mask,
        target_mapping,
        segment_ids,
        lm_labels,
        sequence_labels,
        is_impossible_labels,
    ):
        model = TFXLNetForSequenceClassification(config)

        logits, mems_1 = model(input_ids_1)

        result = {
            "mems_1": [mem.numpy() for mem in mems_1],
            "logits": logits.numpy(),
        }

        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.type_sequence_label_size])
        self.parent.assertListEqual(
            list(list(mem.shape) for mem in result["mems_1"]),
            [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
        )

    def create_and_check_xlnet_for_token_classification(
        self,
        config,
        input_ids_1,
        input_ids_2,
        input_ids_q,
        perm_mask,
        input_mask,
        target_mapping,
        segment_ids,
        lm_labels,
        sequence_labels,
        is_impossible_labels,
    ):
        config.num_labels = input_ids_1.shape[1]
        model = TFXLNetForTokenClassification(config)
        inputs = {
            "input_ids": input_ids_1,
            "attention_mask": input_mask,
            # 'token_type_ids': token_type_ids
        }
        logits, mems_1 = model(inputs)
        result = {
            "mems_1": [mem.numpy() for mem in mems_1],
            "logits": logits.numpy(),
|
||||||
|
}
|
||||||
|
self.parent.assertListEqual(
|
||||||
|
list(result["logits"].shape), [self.batch_size, self.seq_length, config.num_labels]
|
||||||
|
)
|
||||||
|
self.parent.assertListEqual(
|
||||||
|
list(list(mem.shape) for mem in result["mems_1"]),
|
||||||
|
[[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
|
||||||
|
)
|
||||||
|
|
||||||
|
def prepare_config_and_inputs_for_common(self):
|
||||||
|
config_and_inputs = self.prepare_config_and_inputs()
|
||||||
|
(
|
||||||
|
config,
|
||||||
|
input_ids_1,
|
||||||
|
input_ids_2,
|
||||||
|
input_ids_q,
|
||||||
|
perm_mask,
|
||||||
|
input_mask,
|
||||||
|
target_mapping,
|
||||||
|
segment_ids,
|
||||||
|
lm_labels,
|
||||||
|
sequence_labels,
|
||||||
|
is_impossible_labels,
|
||||||
|
) = config_and_inputs
|
||||||
|
inputs_dict = {"input_ids": input_ids_1}
|
||||||
|
return config, inputs_dict
|
||||||
|
|
||||||
|
|
||||||
 @require_tf
 class TFXLNetModelTest(TFModelTesterMixin, unittest.TestCase):
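The hoisted tester above exercises both input conventions the TF models accept: a dict of named tensors and a plain positional list. A minimal standalone sketch of the dict form, assuming the tuple-returning call API of the transformers release this commit targets (the tiny config values are illustrative, not the tester's):

import tensorflow as tf
from transformers import TFXLNetModel, XLNetConfig

# Small illustrative config; the sizes are assumptions for this sketch only.
config = XLNetConfig(vocab_size=99, d_model=32, n_head=4, d_inner=128, n_layer=2, mem_len=10)
model = TFXLNetModel(config)

input_ids = tf.constant([[1, 2, 3, 4, 5, 6, 7]])         # (batch=1, seq_len=7)
input_mask = tf.zeros_like(input_ids, dtype=tf.float32)  # 0.0 means the token is visible

# Keras-style dict input, mirroring create_and_check_xlnet_base_model above.
outputs, mems = model({"input_ids": input_ids, "input_mask": input_mask})
print(outputs.shape)  # (1, 7, 32), i.e. (batch, seq_len, d_model)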
@@ -56,329 +354,8 @@ class TFXLNetModelTest(TFModelTesterMixin, unittest.TestCase):
     ) # TODO (PVP): Check other models whether language generation is also applicable
     test_pruning = False
 
-    class TFXLNetModelTester(object):
-        def __init__(
-            self,
-            parent,
-            batch_size=13,
-            seq_length=7,
-            mem_len=10,
-            clamp_len=-1,
-            reuse_len=15,
-            is_training=True,
-            use_labels=True,
-            vocab_size=99,
-            cutoffs=[10, 50, 80],
-            hidden_size=32,
-            num_attention_heads=4,
-            d_inner=128,
-            num_hidden_layers=5,
-            type_sequence_label_size=2,
-            untie_r=True,
-            bi_data=False,
-            same_length=False,
-            initializer_range=0.05,
-            seed=1,
-            type_vocab_size=2,
-            bos_token_id=1,
-            eos_token_id=2,
-            pad_token_id=5,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.mem_len = mem_len
-            # self.key_len = seq_length + mem_len
-            self.clamp_len = clamp_len
-            self.reuse_len = reuse_len
-            self.is_training = is_training
-            self.use_labels = use_labels
-            self.vocab_size = vocab_size
-            self.cutoffs = cutoffs
-            self.hidden_size = hidden_size
-            self.num_attention_heads = num_attention_heads
-            self.d_inner = d_inner
-            self.num_hidden_layers = num_hidden_layers
-            self.bi_data = bi_data
-            self.untie_r = untie_r
-            self.same_length = same_length
-            self.initializer_range = initializer_range
-            self.seed = seed
-            self.type_vocab_size = type_vocab_size
-            self.type_sequence_label_size = type_sequence_label_size
-            self.bos_token_id = bos_token_id
-            self.pad_token_id = pad_token_id
-            self.eos_token_id = eos_token_id
-
-        def prepare_config_and_inputs(self):
-            input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-            input_ids_2 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-            segment_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-            input_mask = ids_tensor([self.batch_size, self.seq_length], 2, dtype=tf.float32)
-
-            input_ids_q = ids_tensor([self.batch_size, self.seq_length + 1], self.vocab_size)
-            perm_mask = tf.zeros((self.batch_size, self.seq_length + 1, self.seq_length), dtype=tf.float32)
-            perm_mask_last = tf.ones((self.batch_size, self.seq_length + 1, 1), dtype=tf.float32)
-            perm_mask = tf.concat([perm_mask, perm_mask_last], axis=-1)
-            # perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token
-            target_mapping = tf.zeros((self.batch_size, 1, self.seq_length), dtype=tf.float32)
-            target_mapping_last = tf.ones((self.batch_size, 1, 1), dtype=tf.float32)
-            target_mapping = tf.concat([target_mapping, target_mapping_last], axis=-1)
-            # target_mapping[:, 0, -1] = 1.0 # predict last token
-
-            sequence_labels = None
-            lm_labels = None
-            is_impossible_labels = None
-            if self.use_labels:
-                lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-                is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)
-
-            config = XLNetConfig(
-                vocab_size=self.vocab_size,
-                d_model=self.hidden_size,
-                n_head=self.num_attention_heads,
-                d_inner=self.d_inner,
-                n_layer=self.num_hidden_layers,
-                untie_r=self.untie_r,
-                mem_len=self.mem_len,
-                clamp_len=self.clamp_len,
-                same_length=self.same_length,
-                reuse_len=self.reuse_len,
-                bi_data=self.bi_data,
-                initializer_range=self.initializer_range,
-                num_labels=self.type_sequence_label_size,
-                bos_token_id=self.bos_token_id,
-                pad_token_id=self.pad_token_id,
-                eos_token_id=self.eos_token_id,
-            )
-
-            return (
-                config,
-                input_ids_1,
-                input_ids_2,
-                input_ids_q,
-                perm_mask,
-                input_mask,
-                target_mapping,
-                segment_ids,
-                lm_labels,
-                sequence_labels,
-                is_impossible_labels,
-            )
 
     def setUp(self):
-        self.model_tester = TFXLNetModelTest.TFXLNetModelTester(self)
+        self.model_tester = TFXLNetModelTester(self)
         self.config_tester = ConfigTester(self, config_class=XLNetConfig, d_inner=37)
 
     def test_config(self):
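Hoisting the tester to module level is what lets setUp drop the TFXLNetModelTest. prefix: the tester no longer lives inside the TestCase, and only needs a parent object that exposes unittest-style assert methods. A sketch of that loosened contract; the DummyParent helper is hypothetical, purely for illustration:

import unittest


class DummyParent:
    """Stands in for a TestCase: the tester only ever calls assert* on its parent."""

    def __init__(self):
        tc = unittest.TestCase()
        self.assertEqual = tc.assertEqual
        self.assertListEqual = tc.assertListEqual
        self.assertIsInstance = tc.assertIsInstance


# Because the tester is now a top-level name, other modules can drive it directly:
#     tester = TFXLNetModelTester(DummyParent())
#     tester.create_and_check_xlnet_base_model(*tester.prepare_config_and_inputs())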
@@ -29,6 +29,137 @@ if is_torch_available():
     from transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST
 
 
+class TransfoXLModelTester:
+    def __init__(
+        self, parent,
+    ):
+        self.parent = parent
+        self.batch_size = 14
+        self.seq_length = 7
+        self.mem_len = 30
+        self.key_length = self.seq_length + self.mem_len
+        self.clamp_len = 15
+        self.is_training = True
+        self.use_labels = True
+        self.vocab_size = 99
+        self.cutoffs = [10, 50, 80]
+        self.hidden_size = 32
+        self.d_embed = 32
+        self.num_attention_heads = 4
+        self.d_head = 8
+        self.d_inner = 128
+        self.div_val = 2
+        self.num_hidden_layers = 5
+        self.scope = None
+        self.seed = 1
+        self.eos_token_id = 0
+
+    def prepare_config_and_inputs(self):
+        input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+        input_ids_2 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+        lm_labels = None
+        if self.use_labels:
+            lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+        config = TransfoXLConfig(
+            vocab_size=self.vocab_size,
+            mem_len=self.mem_len,
+            clamp_len=self.clamp_len,
+            cutoffs=self.cutoffs,
+            d_model=self.hidden_size,
+            d_embed=self.d_embed,
+            n_head=self.num_attention_heads,
+            d_head=self.d_head,
+            d_inner=self.d_inner,
+            div_val=self.div_val,
+            n_layer=self.num_hidden_layers,
+            eos_token_id=self.eos_token_id,
+        )
+
+        return (config, input_ids_1, input_ids_2, lm_labels)
+
+    def set_seed(self):
+        random.seed(self.seed)
+        torch.manual_seed(self.seed)
+
+    def create_transfo_xl_model(self, config, input_ids_1, input_ids_2, lm_labels):
+        model = TransfoXLModel(config)
+        model.to(torch_device)
+        model.eval()
+
+        hidden_states_1, mems_1 = model(input_ids_1)
+        hidden_states_2, mems_2 = model(input_ids_2, mems_1)
+        outputs = {
+            "hidden_states_1": hidden_states_1,
+            "mems_1": mems_1,
+            "hidden_states_2": hidden_states_2,
+            "mems_2": mems_2,
+        }
+        return outputs
+
+    def check_transfo_xl_model_output(self, result):
+        self.parent.assertListEqual(
+            list(result["hidden_states_1"].size()), [self.batch_size, self.seq_length, self.hidden_size],
+        )
+        self.parent.assertListEqual(
+            list(result["hidden_states_2"].size()), [self.batch_size, self.seq_length, self.hidden_size],
+        )
+        self.parent.assertListEqual(
+            list(list(mem.size()) for mem in result["mems_1"]),
+            [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
+        )
+        self.parent.assertListEqual(
+            list(list(mem.size()) for mem in result["mems_2"]),
+            [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
+        )
+
+    def create_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels):
+        model = TransfoXLLMHeadModel(config)
+        model.to(torch_device)
+        model.eval()
+
+        lm_logits_1, mems_1 = model(input_ids_1)
+        loss_1, _, mems_1 = model(input_ids_1, labels=lm_labels)
+        lm_logits_2, mems_2 = model(input_ids_2, mems=mems_1)
+        loss_2, _, mems_2 = model(input_ids_2, labels=lm_labels, mems=mems_1)
+
+        outputs = {
+            "loss_1": loss_1,
+            "mems_1": mems_1,
+            "lm_logits_1": lm_logits_1,
+            "loss_2": loss_2,
+            "mems_2": mems_2,
+            "lm_logits_2": lm_logits_2,
+        }
+        return outputs
+
+    def check_transfo_xl_lm_head_output(self, result):
+        self.parent.assertListEqual(list(result["loss_1"].size()), [self.batch_size, self.seq_length - 1])
+        self.parent.assertListEqual(
+            list(result["lm_logits_1"].size()), [self.batch_size, self.seq_length, self.vocab_size],
+        )
+        self.parent.assertListEqual(
+            list(list(mem.size()) for mem in result["mems_1"]),
+            [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
+        )
+
+        self.parent.assertListEqual(list(result["loss_2"].size()), [self.batch_size, self.seq_length - 1])
+        self.parent.assertListEqual(
+            list(result["lm_logits_2"].size()), [self.batch_size, self.seq_length, self.vocab_size],
+        )
+        self.parent.assertListEqual(
+            list(list(mem.size()) for mem in result["mems_2"]),
+            [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
+        )
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        (config, input_ids_1, input_ids_2, lm_labels) = config_and_inputs
+        inputs_dict = {"input_ids": input_ids_1}
+        return config, inputs_dict
+
+
 @require_torch
 class TransfoXLModelTest(ModelTesterMixin, unittest.TestCase):
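The create_transfo_xl_model / check_transfo_xl_model_output pair above exercises Transfo-XL's segment-level recurrence: the memory tensors returned for one segment are fed back in with the next segment, and each memory keeps shape [mem_len, batch_size, d_model]. A standalone sketch of that loop, assuming the tuple-returning API of the transformers release this commit targets (config sizes mirror the tester but are otherwise arbitrary):

import torch
from transformers import TransfoXLConfig, TransfoXLModel

config = TransfoXLConfig(vocab_size=99, cutoffs=[10, 50, 80], d_model=32, d_embed=32,
                         n_head=4, d_head=8, d_inner=128, div_val=2, n_layer=2, mem_len=30)
model = TransfoXLModel(config).eval()

mems = None  # the first segment starts with empty memories
for _ in range(3):  # three consecutive segments of the same token stream
    segment = torch.randint(0, config.vocab_size, (4, 7))  # (batch, seq_len)
    with torch.no_grad():
        hidden, mems = model(segment, mems)  # mems carry context into the next call
    assert all(m.shape == (config.mem_len, 4, config.d_model) for m in mems)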
@@ -38,155 +169,6 @@ class TransfoXLModelTest(ModelTesterMixin, unittest.TestCase):
     test_torchscript = False
     test_resize_embeddings = True
 
-    class TransfoXLModelTester(object):
-        def __init__(
-            self,
-            parent,
-            batch_size=14,
-            seq_length=7,
-            mem_len=30,
-            clamp_len=15,
-            is_training=True,
-            use_labels=True,
-            vocab_size=99,
-            cutoffs=[10, 50, 80],
-            hidden_size=32,
-            d_embed=32,
-            num_attention_heads=4,
-            d_head=8,
-            d_inner=128,
-            div_val=2,
-            num_hidden_layers=5,
-            scope=None,
-            seed=1,
-            eos_token_id=0,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.mem_len = mem_len
-            self.key_length = seq_length + mem_len
-            self.clamp_len = clamp_len
-            self.is_training = is_training
-            self.use_labels = use_labels
-            self.vocab_size = vocab_size
-            self.cutoffs = cutoffs
-            self.hidden_size = hidden_size
-            self.d_embed = d_embed
-            self.num_attention_heads = num_attention_heads
-            self.d_head = d_head
-            self.d_inner = d_inner
-            self.div_val = div_val
-            self.num_hidden_layers = num_hidden_layers
-            self.scope = scope
-            self.seed = seed
-            self.eos_token_id = eos_token_id
     def check_cutoffs_and_n_token(
         self, copied_cutoffs, layer, model_embed, model, model_class, resized_value, vocab_size
     ):
@@ -210,7 +192,7 @@ class TransfoXLModelTest(ModelTesterMixin, unittest.TestCase):
         self.assertEqual(model.crit.n_token, vocab_size + resized_value)
 
     def setUp(self):
-        self.model_tester = TransfoXLModelTest.TransfoXLModelTester(self)
+        self.model_tester = TransfoXLModelTester(self)
         self.config_tester = ConfigTester(self, config_class=TransfoXLConfig, d_embed=37)
 
     def test_config(self):
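setUp also rebuilds the ConfigTester with d_embed=37, a deliberately non-default value, so the shared config tests cannot silently pass on defaults. The core property ConfigTester relies on is that a config round-trips through its dict form; a simplified sketch of that check, not the real helper:

from transformers import TransfoXLConfig

config = TransfoXLConfig(d_embed=37)  # one overridden attribute
restored = TransfoXLConfig.from_dict(config.to_dict())

assert config.to_dict()["d_embed"] == 37
assert restored.d_embed == config.d_embed  # the override survives the round-trip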
@@ -37,6 +37,306 @@ if is_torch_available():
     from transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_LIST
 
 
+class XLMModelTester:
+    def __init__(
+        self, parent,
+    ):
+        self.parent = parent
+        self.batch_size = 13
+        self.seq_length = 7
+        self.is_training = True
+        self.use_input_lengths = True
+        self.use_token_type_ids = True
+        self.use_labels = True
+        self.gelu_activation = True
+        self.sinusoidal_embeddings = False
+        self.causal = False
+        self.asm = False
+        self.n_langs = 2
+        self.vocab_size = 99
+        self.n_special = 0
+        self.hidden_size = 32
+        self.num_hidden_layers = 5
+        self.num_attention_heads = 4
+        self.hidden_dropout_prob = 0.1
+        self.attention_probs_dropout_prob = 0.1
+        self.max_position_embeddings = 512
+        self.type_sequence_label_size = 2
+        self.initializer_range = 0.02
+        self.num_labels = 3
+        self.num_choices = 4
+        self.summary_type = "last"
+        self.use_proj = True
+        self.scope = None
+        self.bos_token_id = 0
+
+    def prepare_config_and_inputs(self):
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+        input_mask = ids_tensor([self.batch_size, self.seq_length], 2).float()
+
+        input_lengths = None
+        if self.use_input_lengths:
+            input_lengths = (
+                ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2
+            )  # small variation of seq_length
+
+        token_type_ids = None
+        if self.use_token_type_ids:
+            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.n_langs)
+
+        sequence_labels = None
+        token_labels = None
+        is_impossible_labels = None
+        if self.use_labels:
+            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
+            is_impossible_labels = ids_tensor([self.batch_size], 2).float()
+
+        config = XLMConfig(
+            vocab_size=self.vocab_size,
+            n_special=self.n_special,
+            emb_dim=self.hidden_size,
+            n_layers=self.num_hidden_layers,
+            n_heads=self.num_attention_heads,
+            dropout=self.hidden_dropout_prob,
+            attention_dropout=self.attention_probs_dropout_prob,
+            gelu_activation=self.gelu_activation,
+            sinusoidal_embeddings=self.sinusoidal_embeddings,
+            asm=self.asm,
+            causal=self.causal,
+            n_langs=self.n_langs,
+            max_position_embeddings=self.max_position_embeddings,
+            initializer_range=self.initializer_range,
+            summary_type=self.summary_type,
+            use_proj=self.use_proj,
+            bos_token_id=self.bos_token_id,
+        )
+
+        return (
+            config,
+            input_ids,
+            token_type_ids,
+            input_lengths,
+            sequence_labels,
+            token_labels,
+            is_impossible_labels,
+            input_mask,
+        )
+
+    def check_loss_output(self, result):
+        self.parent.assertListEqual(list(result["loss"].size()), [])
+
+    def create_and_check_xlm_model(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_lengths,
+        sequence_labels,
+        token_labels,
+        is_impossible_labels,
+        input_mask,
+    ):
+        model = XLMModel(config=config)
+        model.to(torch_device)
+        model.eval()
+        outputs = model(input_ids, lengths=input_lengths, langs=token_type_ids)
+        outputs = model(input_ids, langs=token_type_ids)
+        outputs = model(input_ids)
+        sequence_output = outputs[0]
+        result = {
+            "sequence_output": sequence_output,
+        }
+        self.parent.assertListEqual(
+            list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
+        )
+
+    def create_and_check_xlm_lm_head(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_lengths,
+        sequence_labels,
+        token_labels,
+        is_impossible_labels,
+        input_mask,
+    ):
+        model = XLMWithLMHeadModel(config)
+        model.to(torch_device)
+        model.eval()
+
+        loss, logits = model(input_ids, token_type_ids=token_type_ids, labels=token_labels)
+
+        result = {
+            "loss": loss,
+            "logits": logits,
+        }
+
+        self.parent.assertListEqual(list(result["loss"].size()), [])
+        self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size])
+
+    def create_and_check_xlm_simple_qa(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_lengths,
+        sequence_labels,
+        token_labels,
+        is_impossible_labels,
+        input_mask,
+    ):
+        model = XLMForQuestionAnsweringSimple(config)
+        model.to(torch_device)
+        model.eval()
+
+        outputs = model(input_ids)
+
+        outputs = model(input_ids, start_positions=sequence_labels, end_positions=sequence_labels)
+        loss, start_logits, end_logits = outputs
+
+        result = {
+            "loss": loss,
+            "start_logits": start_logits,
+            "end_logits": end_logits,
+        }
+        self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
+        self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
+        self.check_loss_output(result)
+
+    def create_and_check_xlm_qa(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_lengths,
+        sequence_labels,
+        token_labels,
+        is_impossible_labels,
+        input_mask,
+    ):
+        model = XLMForQuestionAnswering(config)
+        model.to(torch_device)
+        model.eval()
+
+        outputs = model(input_ids)
+        start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = outputs
+
+        outputs = model(
+            input_ids,
+            start_positions=sequence_labels,
+            end_positions=sequence_labels,
+            cls_index=sequence_labels,
+            is_impossible=is_impossible_labels,
+            p_mask=input_mask,
+        )
+
+        outputs = model(
+            input_ids,
+            start_positions=sequence_labels,
+            end_positions=sequence_labels,
+            cls_index=sequence_labels,
+            is_impossible=is_impossible_labels,
+        )
+
+        (total_loss,) = outputs
+
+        outputs = model(input_ids, start_positions=sequence_labels, end_positions=sequence_labels)
+
+        (total_loss,) = outputs
+
+        result = {
+            "loss": total_loss,
+            "start_top_log_probs": start_top_log_probs,
+            "start_top_index": start_top_index,
+            "end_top_log_probs": end_top_log_probs,
+            "end_top_index": end_top_index,
+            "cls_logits": cls_logits,
+        }
+
+        self.parent.assertListEqual(list(result["loss"].size()), [])
+        self.parent.assertListEqual(
+            list(result["start_top_log_probs"].size()), [self.batch_size, model.config.start_n_top]
+        )
+        self.parent.assertListEqual(
+            list(result["start_top_index"].size()), [self.batch_size, model.config.start_n_top]
+        )
+        self.parent.assertListEqual(
+            list(result["end_top_log_probs"].size()),
+            [self.batch_size, model.config.start_n_top * model.config.end_n_top],
+        )
+        self.parent.assertListEqual(
+            list(result["end_top_index"].size()), [self.batch_size, model.config.start_n_top * model.config.end_n_top],
+        )
+        self.parent.assertListEqual(list(result["cls_logits"].size()), [self.batch_size])
+
+    def create_and_check_xlm_sequence_classif(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_lengths,
+        sequence_labels,
+        token_labels,
+        is_impossible_labels,
+        input_mask,
+    ):
+        model = XLMForSequenceClassification(config)
+        model.to(torch_device)
+        model.eval()
+
+        (logits,) = model(input_ids)
+        loss, logits = model(input_ids, labels=sequence_labels)
+
+        result = {
+            "loss": loss,
+            "logits": logits,
+        }
+
+        self.parent.assertListEqual(list(result["loss"].size()), [])
+        self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.type_sequence_label_size])
+
+    def create_and_check_xlm_for_token_classification(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_lengths,
+        sequence_labels,
+        token_labels,
+        is_impossible_labels,
+        input_mask,
+    ):
+        config.num_labels = self.num_labels
+        model = XLMForTokenClassification(config)
+        model.to(torch_device)
+        model.eval()
+
+        loss, logits = model(input_ids, attention_mask=input_mask, labels=token_labels)
+        result = {
+            "loss": loss,
+            "logits": logits,
+        }
+        self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels])
+        self.check_loss_output(result)
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        (
+            config,
+            input_ids,
+            token_type_ids,
+            input_lengths,
+            sequence_labels,
+            token_labels,
+            is_impossible_labels,
+            input_mask,
+        ) = config_and_inputs
+        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "lengths": input_lengths}
+        return config, inputs_dict
+
+
 @require_torch
 class XLMModelTest(ModelTesterMixin, unittest.TestCase):
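create_and_check_xlm_qa pins down the beam-style outputs of the XLNet-like QA head: for each example the model keeps the start_n_top best start positions, and for each of those the end_n_top best ends, so the end tensors have start_n_top * end_n_top columns. A worked shape calculation under the library's default beam sizes (5 and 5 are the config defaults; the batch size simply mirrors the tester):

batch_size, start_n_top, end_n_top = 13, 5, 5

end_candidates = start_n_top * end_n_top  # 5 starts x 5 ends each = 25 pairs

expected_shapes = {
    "start_top_log_probs": (batch_size, start_n_top),   # (13, 5)
    "start_top_index": (batch_size, start_n_top),       # (13, 5)
    "end_top_log_probs": (batch_size, end_candidates),  # (13, 25)
    "end_top_index": (batch_size, end_candidates),      # (13, 25)
    "cls_logits": (batch_size,),                        # one answerability score per example
}
print(expected_shapes)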
@@ -55,345 +355,8 @@ class XLMModelTest(ModelTesterMixin, unittest.TestCase):
         (XLMWithLMHeadModel,) if is_torch_available() else ()
     ) # TODO (PVP): Check other models whether language generation is also applicable
 
-    class XLMModelTester(object):
-        def __init__(
-            self,
-            parent,
-            batch_size=13,
-            seq_length=7,
-            is_training=True,
-            use_input_lengths=True,
-            use_token_type_ids=True,
-            use_labels=True,
-            gelu_activation=True,
-            sinusoidal_embeddings=False,
-            causal=False,
-            asm=False,
-            n_langs=2,
-            vocab_size=99,
-            n_special=0,
-            hidden_size=32,
-            num_hidden_layers=5,
-            num_attention_heads=4,
-            hidden_dropout_prob=0.1,
-            attention_probs_dropout_prob=0.1,
-            max_position_embeddings=512,
-            type_vocab_size=16,
-            type_sequence_label_size=2,
-            initializer_range=0.02,
-            num_labels=3,
-            num_choices=4,
-            summary_type="last",
-            use_proj=True,
-            scope=None,
-            bos_token_id=0,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.is_training = is_training
-            self.use_input_lengths = use_input_lengths
-            self.use_token_type_ids = use_token_type_ids
-            self.use_labels = use_labels
-            self.gelu_activation = gelu_activation
-            self.sinusoidal_embeddings = sinusoidal_embeddings
-            self.asm = asm
-            self.n_langs = n_langs
-            self.vocab_size = vocab_size
-            self.n_special = n_special
-            self.summary_type = summary_type
-            self.causal = causal
-            self.use_proj = use_proj
-            self.hidden_size = hidden_size
-            self.num_hidden_layers = num_hidden_layers
-            self.num_attention_heads = num_attention_heads
-            self.hidden_dropout_prob = hidden_dropout_prob
-            self.attention_probs_dropout_prob = attention_probs_dropout_prob
-            self.max_position_embeddings = max_position_embeddings
-            self.n_langs = n_langs
-            self.type_sequence_label_size = type_sequence_label_size
-            self.initializer_range = initializer_range
-            self.summary_type = summary_type
-            self.num_labels = num_labels
-            self.num_choices = num_choices
-            self.scope = scope
-            self.bos_token_id = bos_token_id
     def setUp(self):
-        self.model_tester = XLMModelTest.XLMModelTester(self)
+        self.model_tester = XLMModelTester(self)
        self.config_tester = ConfigTester(self, config_class=XLMConfig, emb_dim=37)
 
     def test_config(self):
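Every hoisted tester ends with prepare_config_and_inputs_for_common, the hook the shared ModelTesterMixin tests consume: all they need is a (config, inputs_dict) pair that every model class in the suite accepts. A simplified sketch of that contract, not the actual mixin source (run_common_forward is a hypothetical stand-in):

import torch


def run_common_forward(tester, model_classes, device="cpu"):
    """Drive a hoisted *ModelTester the way ModelTesterMixin's common tests do."""
    config, inputs_dict = tester.prepare_config_and_inputs_for_common()
    for model_class in model_classes:
        model = model_class(config).to(device)
        model.eval()
        with torch.no_grad():
            model(**inputs_dict)  # every model class must accept the common dict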
@@ -39,6 +39,415 @@ if is_torch_available():
     from transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_LIST
 
 
+class XLNetModelTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=14,
+        seq_length=7,
+        mem_len=10,
+        clamp_len=-1,
+        reuse_len=15,
+        is_training=True,
+        use_labels=True,
+        vocab_size=99,
+        cutoffs=[10, 50, 80],
+        hidden_size=32,
+        num_attention_heads=4,
+        d_inner=128,
+        num_hidden_layers=5,
+        type_sequence_label_size=2,
+        untie_r=True,
+        bi_data=False,
+        same_length=False,
+        initializer_range=0.05,
+        seed=1,
+        type_vocab_size=2,
+        bos_token_id=1,
+        eos_token_id=2,
+        pad_token_id=5,
+        num_choices=4,
+    ):
+        self.parent = parent
+        self.batch_size = 14
+        self.seq_length = 7
+        self.mem_len = 10
+        # self.key_len = seq_length + mem_len
+        self.clamp_len = -1
+        self.reuse_len = 15
+        self.is_training = True
+        self.use_labels = True
+        self.vocab_size = 99
+        self.cutoffs = [10, 50, 80]
+        self.hidden_size = 32
+        self.num_attention_heads = 4
+        self.d_inner = 128
+        self.num_hidden_layers = 5
+        self.type_sequence_label_size = 2
+        self.untie_r = True
+        self.bi_data = False
+        self.same_length = False
+        self.initializer_range = 0.05
+        self.seed = 1
+        self.type_vocab_size = 2
+        self.bos_token_id = 1
+        self.eos_token_id = 2
+        self.pad_token_id = 5
+        self.num_choices = 4
+
+    def prepare_config_and_inputs(self):
+        input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+        input_ids_2 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+        segment_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
+        input_mask = ids_tensor([self.batch_size, self.seq_length], 2).float()
+
+        input_ids_q = ids_tensor([self.batch_size, self.seq_length + 1], self.vocab_size)
+        perm_mask = torch.zeros(
+            self.batch_size, self.seq_length + 1, self.seq_length + 1, dtype=torch.float, device=torch_device,
+        )
+        perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
+        target_mapping = torch.zeros(self.batch_size, 1, self.seq_length + 1, dtype=torch.float, device=torch_device,)
+        target_mapping[:, 0, -1] = 1.0  # predict last token
+
+        sequence_labels = None
+        lm_labels = None
+        is_impossible_labels = None
+        token_labels = None
+        if self.use_labels:
+            lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+            is_impossible_labels = ids_tensor([self.batch_size], 2).float()
+            token_labels = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
+
+        config = XLNetConfig(
+            vocab_size=self.vocab_size,
+            d_model=self.hidden_size,
+            n_head=self.num_attention_heads,
+            d_inner=self.d_inner,
+            n_layer=self.num_hidden_layers,
+            untie_r=self.untie_r,
+            mem_len=self.mem_len,
+            clamp_len=self.clamp_len,
+            same_length=self.same_length,
+            reuse_len=self.reuse_len,
+            bi_data=self.bi_data,
+            initializer_range=self.initializer_range,
+            num_labels=self.type_sequence_label_size,
+            bos_token_id=self.bos_token_id,
+            pad_token_id=self.pad_token_id,
+            eos_token_id=self.eos_token_id,
+        )
+
+        return (
+            config,
+            input_ids_1,
+            input_ids_2,
+            input_ids_q,
+            perm_mask,
+            input_mask,
+            target_mapping,
+            segment_ids,
+            lm_labels,
+            sequence_labels,
+            is_impossible_labels,
+            token_labels,
+        )
+
+
+    def set_seed(self):
+        random.seed(self.seed)
+        torch.manual_seed(self.seed)
+
+    def create_and_check_xlnet_base_model(
+        self,
+        config,
+        input_ids_1,
+        input_ids_2,
+        input_ids_q,
+        perm_mask,
+        input_mask,
+        target_mapping,
+        segment_ids,
+        lm_labels,
+        sequence_labels,
+        is_impossible_labels,
+        token_labels,
+    ):
+        model = XLNetModel(config)
+        model.to(torch_device)
+        model.eval()
+
+        _, _ = model(input_ids_1, input_mask=input_mask)
+        _, _ = model(input_ids_1, attention_mask=input_mask)
+        _, _ = model(input_ids_1, token_type_ids=segment_ids)
+        outputs, mems_1 = model(input_ids_1)
+
+        result = {
+            "mems_1": mems_1,
+            "outputs": outputs,
+        }
+
+        config.mem_len = 0
+        model = XLNetModel(config)
+        model.to(torch_device)
+        model.eval()
+        no_mems_outputs = model(input_ids_1)
+        self.parent.assertEqual(len(no_mems_outputs), 1)
+
+        self.parent.assertListEqual(
+            list(result["outputs"].size()), [self.batch_size, self.seq_length, self.hidden_size],
+        )
+        self.parent.assertListEqual(
+            list(list(mem.size()) for mem in result["mems_1"]),
+            [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
+        )
+
+    def create_and_check_xlnet_base_model_with_att_output(
+        self,
+        config,
+        input_ids_1,
+        input_ids_2,
+        input_ids_q,
+        perm_mask,
+        input_mask,
+        target_mapping,
+        segment_ids,
+        lm_labels,
+        sequence_labels,
+        is_impossible_labels,
+        token_labels,
+    ):
+        model = XLNetModel(config)
+        model.to(torch_device)
+        model.eval()
+
+        _, _, attentions = model(input_ids_1, target_mapping=target_mapping, output_attentions=True)
+
+        self.parent.assertEqual(len(attentions), config.n_layer)
+        self.parent.assertIsInstance(attentions[0], tuple)
+        self.parent.assertEqual(len(attentions[0]), 2)
+        self.parent.assertTrue(attentions[0][0].shape, attentions[0][0].shape)
+
+    def create_and_check_xlnet_lm_head(
+        self,
+        config,
+        input_ids_1,
+        input_ids_2,
+        input_ids_q,
+        perm_mask,
+        input_mask,
+        target_mapping,
+        segment_ids,
+        lm_labels,
+        sequence_labels,
+        is_impossible_labels,
+        token_labels,
+    ):
+        model = XLNetLMHeadModel(config)
+        model.to(torch_device)
+        model.eval()
+
+        loss_1, all_logits_1, mems_1 = model(input_ids_1, token_type_ids=segment_ids, labels=lm_labels)
+
+        loss_2, all_logits_2, mems_2 = model(input_ids_2, token_type_ids=segment_ids, labels=lm_labels, mems=mems_1)
+
+        logits, _ = model(input_ids_q, perm_mask=perm_mask, target_mapping=target_mapping)
+
+        result = {
+            "loss_1": loss_1,
+            "mems_1": mems_1,
+            "all_logits_1": all_logits_1,
+            "loss_2": loss_2,
+            "mems_2": mems_2,
+            "all_logits_2": all_logits_2,
+        }
+
+        self.parent.assertListEqual(list(result["loss_1"].size()), [])
+        self.parent.assertListEqual(
+            list(result["all_logits_1"].size()), [self.batch_size, self.seq_length, self.vocab_size],
+        )
+        self.parent.assertListEqual(
+            list(list(mem.size()) for mem in result["mems_1"]),
+            [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
+        )
+
+        self.parent.assertListEqual(list(result["loss_2"].size()), [])
+        self.parent.assertListEqual(
+            list(result["all_logits_2"].size()), [self.batch_size, self.seq_length, self.vocab_size],
+        )
+        self.parent.assertListEqual(
+            list(list(mem.size()) for mem in result["mems_2"]),
+            [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
+        )
+
+    def create_and_check_xlnet_qa(
+        self,
+        config,
+        input_ids_1,
+        input_ids_2,
+        input_ids_q,
+        perm_mask,
+        input_mask,
+        target_mapping,
+        segment_ids,
+        lm_labels,
+        sequence_labels,
+        is_impossible_labels,
+        token_labels,
+    ):
+        model = XLNetForQuestionAnswering(config)
+        model.to(torch_device)
+        model.eval()
+
+        outputs = model(input_ids_1)
+        (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits, mems,) = outputs
+
+        outputs = model(
+            input_ids_1,
+            start_positions=sequence_labels,
+            end_positions=sequence_labels,
+            cls_index=sequence_labels,
+            is_impossible=is_impossible_labels,
+            p_mask=input_mask,
+        )
+
+        outputs = model(
+            input_ids_1,
+            start_positions=sequence_labels,
+            end_positions=sequence_labels,
+            cls_index=sequence_labels,
+            is_impossible=is_impossible_labels,
+        )
+
+        total_loss, mems = outputs
+
+        outputs = model(input_ids_1, start_positions=sequence_labels, end_positions=sequence_labels,)
+
+        total_loss, mems = outputs
+
+        result = {
+            "loss": total_loss,
+            "start_top_log_probs": start_top_log_probs,
+            "start_top_index": start_top_index,
+            "end_top_log_probs": end_top_log_probs,
+            "end_top_index": end_top_index,
+            "cls_logits": cls_logits,
+            "mems": mems,
+        }
+
+        self.parent.assertListEqual(list(result["loss"].size()), [])
+        self.parent.assertListEqual(
+            list(result["start_top_log_probs"].size()), [self.batch_size, model.config.start_n_top],
+        )
+        self.parent.assertListEqual(
+            list(result["start_top_index"].size()), [self.batch_size, model.config.start_n_top],
+        )
+        self.parent.assertListEqual(
+            list(result["end_top_log_probs"].size()),
+            [self.batch_size, model.config.start_n_top * model.config.end_n_top],
+        )
+        self.parent.assertListEqual(
+            list(result["end_top_index"].size()), [self.batch_size, model.config.start_n_top * model.config.end_n_top],
+        )
+        self.parent.assertListEqual(list(result["cls_logits"].size()), [self.batch_size])
+        self.parent.assertListEqual(
+            list(list(mem.size()) for mem in result["mems"]),
+            [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
+        )
+
+    def create_and_check_xlnet_token_classif(
+        self,
+        config,
+        input_ids_1,
+        input_ids_2,
+        input_ids_q,
+        perm_mask,
+        input_mask,
+        target_mapping,
+        segment_ids,
+        lm_labels,
+        sequence_labels,
+        is_impossible_labels,
+        token_labels,
+    ):
+        model = XLNetForTokenClassification(config)
+        model.to(torch_device)
+        model.eval()
+
+        logits, mems_1 = model(input_ids_1)
+        loss, logits, mems_1 = model(input_ids_1, labels=token_labels)
+
+        result = {
+            "loss": loss,
+            "mems_1": mems_1,
+            "logits": logits,
+        }
+
+        self.parent.assertListEqual(list(result["loss"].size()), [])
+        self.parent.assertListEqual(
+            list(result["logits"].size()), [self.batch_size, self.seq_length, self.type_sequence_label_size],
+        )
+        self.parent.assertListEqual(
+            list(list(mem.size()) for mem in result["mems_1"]),
+            [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
+        )
+
+    def create_and_check_xlnet_sequence_classif(
+        self,
+        config,
+        input_ids_1,
+        input_ids_2,
+        input_ids_q,
+        perm_mask,
+        input_mask,
+        target_mapping,
+        segment_ids,
+        lm_labels,
+        sequence_labels,
+        is_impossible_labels,
+        token_labels,
+    ):
+        model = XLNetForSequenceClassification(config)
+        model.to(torch_device)
+        model.eval()
+
+        logits, mems_1 = model(input_ids_1)
+        loss, logits, mems_1 = model(input_ids_1, labels=sequence_labels)
+
+        result = {
+            "loss": loss,
+            "mems_1": mems_1,
+            "logits": logits,
+        }
+
+        self.parent.assertListEqual(list(result["loss"].size()), [])
+        self.parent.assertListEqual(
+            list(result["logits"].size()), [self.batch_size, self.type_sequence_label_size],
+        )
+        self.parent.assertListEqual(
+            list(list(mem.size()) for mem in result["mems_1"]),
+            [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
+        )
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        (
+            config,
+            input_ids_1,
+            input_ids_2,
+            input_ids_q,
+            perm_mask,
+            input_mask,
+            target_mapping,
+            segment_ids,
+            lm_labels,
+            sequence_labels,
+            is_impossible_labels,
+            token_labels,
+        ) = config_and_inputs
+        inputs_dict = {"input_ids": input_ids_1}
+        return config, inputs_dict
+
+
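For readers skimming the hoisted `prepare_config_and_inputs`: the `perm_mask` / `target_mapping` pair is the interesting part, since it encodes XLNet's permutation-LM setup. A standalone sketch of the same two tensors at a toy size, so they can be printed and inspected (nothing below is taken from the diff beyond the two indexing lines):

```python
import torch

batch_size, seq_len = 1, 4

# perm_mask[b, i, j] == 1.0 means position i may NOT attend to position j.
# Hiding the last column from every row is exactly what the tester's
# `perm_mask[:, :, -1] = 1.0` line does at full size.
perm_mask = torch.zeros(batch_size, seq_len, seq_len)
perm_mask[:, :, -1] = 1.0

# target_mapping[b, k, j] == 1.0 marks position j as the k-th prediction
# target; here there is a single target, the hidden last position.
target_mapping = torch.zeros(batch_size, 1, seq_len)
target_mapping[:, 0, -1] = 1.0

print(perm_mask[0])       # last column all ones: nobody sees the last token
print(target_mapping[0])  # one-hot row selecting the last position
```

Fed to `XLNetLMHeadModel` as in the tester's `logits, _ = model(input_ids_q, perm_mask=perm_mask, target_mapping=target_mapping)` call, this asks the model to predict the masked last token without ever attending to it.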
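Both `create_and_check_xlnet_base_model` and `create_and_check_xlnet_lm_head` pivot on the `mems` cache: with `mem_len > 0` every forward pass returns per-layer memories, and the LM-head check feeds `mems_1` back into a second pass. A rough sketch of that reuse pattern, assuming the tuple-returning API of the transformers version this diff targets:

```python
import torch
from transformers import XLNetConfig, XLNetLMHeadModel

# Tiny config mirroring the tester's sizes; mem_len=10 turns the cache on.
config = XLNetConfig(vocab_size=99, d_model=32, n_head=4, d_inner=128, n_layer=5, mem_len=10)
model = XLNetLMHeadModel(config)
model.eval()

segment_1 = torch.randint(0, 99, (2, 7))  # (batch_size, seq_length)
segment_2 = torch.randint(0, 99, (2, 7))

with torch.no_grad():
    logits_1, mems_1 = model(segment_1)               # mems_1: one tensor per layer
    logits_2, mems_2 = model(segment_2, mems=mems_1)  # second pass reads the cache

# After two 7-token segments only the last mem_len states are kept, so each
# cached tensor is (mem_len, batch_size, d_model) -- the shape the tester
# asserts for mems_2 (mems_1 is still only seq_length deep).
assert all(tuple(m.shape) == (10, 2, 32) for m in mems_2)
```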
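The QA check asserts beam-style output shapes: without labels, `XLNetForQuestionAnswering` returns top-k start candidates and, for each of them, top-k end candidates, so the flattened `start_n_top * end_n_top` dimension is the only non-obvious one. A worked shape calculation (the value 5 is the config default for both `start_n_top` and `end_n_top`; that default is an assumption here, not something read from the diff):

```python
batch_size, start_n_top, end_n_top = 14, 5, 5

start_shape = (batch_size, start_n_top)            # start_top_log_probs / start_top_index
end_shape = (batch_size, start_n_top * end_n_top)  # 5 end candidates per start -> (14, 25)
cls_shape = (batch_size,)                          # one answerability logit per example

print(start_shape, end_shape, cls_shape)
```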
 @require_torch
 class XLNetModelTest(ModelTesterMixin, unittest.TestCase):

@@ -59,421 +468,8 @@ class XLNetModelTest(ModelTesterMixin, unittest.TestCase):
     ) # TODO (PVP): Check other models whether language generation is also applicable
     test_pruning = False

-    class XLNetModelTester(object):
-        def __init__(
-            self,
-            parent,
-            batch_size=14,
-            seq_length=7,
-            mem_len=10,
-            clamp_len=-1,
-            reuse_len=15,
-            is_training=True,
-            use_labels=True,
-            vocab_size=99,
-            cutoffs=[10, 50, 80],
-            hidden_size=32,
-            num_attention_heads=4,
-            d_inner=128,
-            num_hidden_layers=5,
-            type_sequence_label_size=2,
-            untie_r=True,
-            bi_data=False,
-            same_length=False,
-            initializer_range=0.05,
-            seed=1,
-            type_vocab_size=2,
-            bos_token_id=1,
-            eos_token_id=2,
-            pad_token_id=5,
-            num_choices=4,
-        ):
-            self.parent = parent
-            self.batch_size = batch_size
-            self.seq_length = seq_length
-            self.mem_len = mem_len
-            # self.key_len = seq_length + mem_len
-            self.clamp_len = clamp_len
-            self.reuse_len = reuse_len
-            self.is_training = is_training
-            self.use_labels = use_labels
-            self.vocab_size = vocab_size
-            self.cutoffs = cutoffs
-            self.hidden_size = hidden_size
-            self.num_attention_heads = num_attention_heads
-            self.d_inner = d_inner
-            self.num_hidden_layers = num_hidden_layers
-            self.bi_data = bi_data
-            self.untie_r = untie_r
-            self.same_length = same_length
-            self.initializer_range = initializer_range
-            self.seed = seed
-            self.type_vocab_size = type_vocab_size
-            self.type_sequence_label_size = type_sequence_label_size
-            self.bos_token_id = bos_token_id
-            self.pad_token_id = pad_token_id
-            self.eos_token_id = eos_token_id
-            self.num_choices = num_choices
-
-        def prepare_config_and_inputs(self):
-            input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-            input_ids_2 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-            segment_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-            input_mask = ids_tensor([self.batch_size, self.seq_length], 2).float()
-
-            input_ids_q = ids_tensor([self.batch_size, self.seq_length + 1], self.vocab_size)
-            perm_mask = torch.zeros(
-                self.batch_size, self.seq_length + 1, self.seq_length + 1, dtype=torch.float, device=torch_device,
-            )
-            perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
-            target_mapping = torch.zeros(
-                self.batch_size, 1, self.seq_length + 1, dtype=torch.float, device=torch_device,
-            )
-            target_mapping[:, 0, -1] = 1.0  # predict last token
-
-            sequence_labels = None
-            lm_labels = None
-            is_impossible_labels = None
-            token_labels = None
-            if self.use_labels:
-                lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-                is_impossible_labels = ids_tensor([self.batch_size], 2).float()
-                token_labels = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-            config = XLNetConfig(
-                vocab_size=self.vocab_size,
-                d_model=self.hidden_size,
-                n_head=self.num_attention_heads,
-                d_inner=self.d_inner,
-                n_layer=self.num_hidden_layers,
-                untie_r=self.untie_r,
-                mem_len=self.mem_len,
-                clamp_len=self.clamp_len,
-                same_length=self.same_length,
-                reuse_len=self.reuse_len,
-                bi_data=self.bi_data,
-                initializer_range=self.initializer_range,
-                num_labels=self.type_sequence_label_size,
-                bos_token_id=self.bos_token_id,
-                pad_token_id=self.pad_token_id,
-                eos_token_id=self.eos_token_id,
-            )
-
-            return (
-                config,
-                input_ids_1,
-                input_ids_2,
-                input_ids_q,
-                perm_mask,
-                input_mask,
-                target_mapping,
-                segment_ids,
-                lm_labels,
-                sequence_labels,
-                is_impossible_labels,
-                token_labels,
-            )
-
-        def set_seed(self):
-            random.seed(self.seed)
-            torch.manual_seed(self.seed)
-
-        def create_and_check_xlnet_base_model(
-            self,
-            config,
-            input_ids_1,
-            input_ids_2,
-            input_ids_q,
-            perm_mask,
-            input_mask,
-            target_mapping,
-            segment_ids,
-            lm_labels,
-            sequence_labels,
-            is_impossible_labels,
-            token_labels,
-        ):
-            model = XLNetModel(config)
-            model.to(torch_device)
-            model.eval()
-
-            _, _ = model(input_ids_1, input_mask=input_mask)
-            _, _ = model(input_ids_1, attention_mask=input_mask)
-            _, _ = model(input_ids_1, token_type_ids=segment_ids)
-            outputs, mems_1 = model(input_ids_1)
-
-            result = {
-                "mems_1": mems_1,
-                "outputs": outputs,
-            }
-
-            config.mem_len = 0
-            model = XLNetModel(config)
-            model.to(torch_device)
-            model.eval()
-            no_mems_outputs = model(input_ids_1)
-            self.parent.assertEqual(len(no_mems_outputs), 1)
-
-            self.parent.assertListEqual(
-                list(result["outputs"].size()), [self.batch_size, self.seq_length, self.hidden_size],
-            )
-            self.parent.assertListEqual(
-                list(list(mem.size()) for mem in result["mems_1"]),
-                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
-            )
-
-        def create_and_check_xlnet_base_model_with_att_output(
-            self,
-            config,
-            input_ids_1,
-            input_ids_2,
-            input_ids_q,
-            perm_mask,
-            input_mask,
-            target_mapping,
-            segment_ids,
-            lm_labels,
-            sequence_labels,
-            is_impossible_labels,
-            token_labels,
-        ):
-            model = XLNetModel(config)
-            model.to(torch_device)
-            model.eval()
-
-            _, _, attentions = model(input_ids_1, target_mapping=target_mapping, output_attentions=True)
-
-            self.parent.assertEqual(len(attentions), config.n_layer)
-            self.parent.assertIsInstance(attentions[0], tuple)
-            self.parent.assertEqual(len(attentions[0]), 2)
-            self.parent.assertTrue(attentions[0][0].shape, attentions[0][0].shape)
-
-        def create_and_check_xlnet_lm_head(
-            self,
-            config,
-            input_ids_1,
-            input_ids_2,
-            input_ids_q,
-            perm_mask,
-            input_mask,
-            target_mapping,
-            segment_ids,
-            lm_labels,
-            sequence_labels,
-            is_impossible_labels,
-            token_labels,
-        ):
-            model = XLNetLMHeadModel(config)
-            model.to(torch_device)
-            model.eval()
-
-            loss_1, all_logits_1, mems_1 = model(input_ids_1, token_type_ids=segment_ids, labels=lm_labels)
-
-            loss_2, all_logits_2, mems_2 = model(
-                input_ids_2, token_type_ids=segment_ids, labels=lm_labels, mems=mems_1
-            )
-
-            logits, _ = model(input_ids_q, perm_mask=perm_mask, target_mapping=target_mapping)
-
-            result = {
-                "loss_1": loss_1,
-                "mems_1": mems_1,
-                "all_logits_1": all_logits_1,
-                "loss_2": loss_2,
-                "mems_2": mems_2,
-                "all_logits_2": all_logits_2,
-            }
-
-            self.parent.assertListEqual(list(result["loss_1"].size()), [])
-            self.parent.assertListEqual(
-                list(result["all_logits_1"].size()), [self.batch_size, self.seq_length, self.vocab_size],
-            )
-            self.parent.assertListEqual(
-                list(list(mem.size()) for mem in result["mems_1"]),
-                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
-            )
-
-            self.parent.assertListEqual(list(result["loss_2"].size()), [])
-            self.parent.assertListEqual(
-                list(result["all_logits_2"].size()), [self.batch_size, self.seq_length, self.vocab_size],
-            )
-            self.parent.assertListEqual(
-                list(list(mem.size()) for mem in result["mems_2"]),
-                [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
-            )
-
-        def create_and_check_xlnet_qa(
-            self,
-            config,
-            input_ids_1,
-            input_ids_2,
-            input_ids_q,
-            perm_mask,
-            input_mask,
-            target_mapping,
-            segment_ids,
-            lm_labels,
-            sequence_labels,
-            is_impossible_labels,
-            token_labels,
-        ):
-            model = XLNetForQuestionAnswering(config)
-            model.to(torch_device)
-            model.eval()
-
-            outputs = model(input_ids_1)
-            (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits, mems,) = outputs
-
-            outputs = model(
-                input_ids_1,
-                start_positions=sequence_labels,
-                end_positions=sequence_labels,
-                cls_index=sequence_labels,
-                is_impossible=is_impossible_labels,
-                p_mask=input_mask,
-            )
-
-            outputs = model(
-                input_ids_1,
-                start_positions=sequence_labels,
-                end_positions=sequence_labels,
-                cls_index=sequence_labels,
-                is_impossible=is_impossible_labels,
-            )
-
-            total_loss, mems = outputs
-
-            outputs = model(input_ids_1, start_positions=sequence_labels, end_positions=sequence_labels,)
-
-            total_loss, mems = outputs
-
-            result = {
-                "loss": total_loss,
-                "start_top_log_probs": start_top_log_probs,
-                "start_top_index": start_top_index,
-                "end_top_log_probs": end_top_log_probs,
-                "end_top_index": end_top_index,
-                "cls_logits": cls_logits,
-                "mems": mems,
-            }
-
-            self.parent.assertListEqual(list(result["loss"].size()), [])
-            self.parent.assertListEqual(
-                list(result["start_top_log_probs"].size()), [self.batch_size, model.config.start_n_top],
-            )
-            self.parent.assertListEqual(
-                list(result["start_top_index"].size()), [self.batch_size, model.config.start_n_top],
-            )
-            self.parent.assertListEqual(
-                list(result["end_top_log_probs"].size()),
-                [self.batch_size, model.config.start_n_top * model.config.end_n_top],
-            )
-            self.parent.assertListEqual(
-                list(result["end_top_index"].size()),
-                [self.batch_size, model.config.start_n_top * model.config.end_n_top],
-            )
-            self.parent.assertListEqual(list(result["cls_logits"].size()), [self.batch_size])
-            self.parent.assertListEqual(
-                list(list(mem.size()) for mem in result["mems"]),
-                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
-            )
-
-        def create_and_check_xlnet_token_classif(
-            self,
-            config,
-            input_ids_1,
-            input_ids_2,
-            input_ids_q,
-            perm_mask,
-            input_mask,
-            target_mapping,
-            segment_ids,
-            lm_labels,
-            sequence_labels,
-            is_impossible_labels,
-            token_labels,
-        ):
-            model = XLNetForTokenClassification(config)
-            model.to(torch_device)
-            model.eval()
-
-            logits, mems_1 = model(input_ids_1)
-            loss, logits, mems_1 = model(input_ids_1, labels=token_labels)
-
-            result = {
-                "loss": loss,
-                "mems_1": mems_1,
-                "logits": logits,
-            }
-
-            self.parent.assertListEqual(list(result["loss"].size()), [])
-            self.parent.assertListEqual(
-                list(result["logits"].size()), [self.batch_size, self.seq_length, self.type_sequence_label_size],
-            )
-            self.parent.assertListEqual(
-                list(list(mem.size()) for mem in result["mems_1"]),
-                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
-            )
-
-        def create_and_check_xlnet_sequence_classif(
-            self,
-            config,
-            input_ids_1,
-            input_ids_2,
-            input_ids_q,
-            perm_mask,
-            input_mask,
-            target_mapping,
-            segment_ids,
-            lm_labels,
-            sequence_labels,
-            is_impossible_labels,
-            token_labels,
-        ):
-            model = XLNetForSequenceClassification(config)
-            model.to(torch_device)
-            model.eval()
-
-            logits, mems_1 = model(input_ids_1)
-            loss, logits, mems_1 = model(input_ids_1, labels=sequence_labels)
-
-            result = {
-                "loss": loss,
-                "mems_1": mems_1,
-                "logits": logits,
-            }
-
-            self.parent.assertListEqual(list(result["loss"].size()), [])
-            self.parent.assertListEqual(
-                list(result["logits"].size()), [self.batch_size, self.type_sequence_label_size],
-            )
-            self.parent.assertListEqual(
-                list(list(mem.size()) for mem in result["mems_1"]),
-                [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
-            )
-
-        def prepare_config_and_inputs_for_common(self):
-            config_and_inputs = self.prepare_config_and_inputs()
-            (
-                config,
-                input_ids_1,
-                input_ids_2,
-                input_ids_q,
-                perm_mask,
-                input_mask,
-                target_mapping,
-                segment_ids,
-                lm_labels,
-                sequence_labels,
-                is_impossible_labels,
-                token_labels,
-            ) = config_and_inputs
-            inputs_dict = {"input_ids": input_ids_1}
-            return config, inputs_dict
-
     def setUp(self):
-        self.model_tester = XLNetModelTest.XLNetModelTester(self)
+        self.model_tester = XLNetModelTester(self)
         self.config_tester = ConfigTester(self, config_class=XLNetConfig, d_inner=37)

     def test_config(self):
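Finally, `prepare_config_and_inputs_for_common` is what `ModelTesterMixin` consumes: each shared test asks the tester for a `(config, inputs_dict)` pair and runs it against every class in `all_model_classes`. A simplified, hypothetical sketch of that control flow (the real mixin lives in the common test utilities and does considerably more):

```python
class CommonTestSketch:
    """Hypothetical stand-in for ModelTesterMixin's control flow."""

    all_model_classes = ()  # the real test class fills this with the XLNet models

    def test_forward_pass(self):
        # The hoisted tester only has to expose this one entry point.
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        for model_class in self.all_model_classes:
            model = model_class(config)
            model.to(torch_device)
            model.eval()
            outputs = model(**inputs_dict)  # tuple outputs in this transformers era
            assert outputs is not None
```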