transformers/tests/test_modeling_reformer.py

1142 lines
46 KiB
Python

# coding=utf-8 # Copyright 2020 Huggingface
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers import is_torch_available
from transformers.testing_utils import (
require_sentencepiece,
require_tokenizers,
require_torch,
require_torch_multigpu,
slow,
torch_device,
)
from .test_configuration_common import ConfigTester
from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
if is_torch_available():
import torch
from transformers import (
REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
ReformerConfig,
ReformerForMaskedLM,
ReformerForQuestionAnswering,
ReformerForSequenceClassification,
ReformerLayer,
ReformerModel,
ReformerModelWithLMHead,
ReformerTokenizer,
)
class ReformerModelTester:
def __init__(
self,
parent,
batch_size=None,
seq_length=None,
is_training=None,
is_decoder=None,
use_input_mask=None,
use_labels=None,
vocab_size=None,
attention_head_size=None,
hidden_size=None,
num_attention_heads=None,
local_attn_chunk_length=None,
local_num_chunks_before=None,
local_num_chunks_after=None,
num_buckets=None,
num_hashes=1,
lsh_attn_chunk_length=None,
lsh_num_chunks_before=None,
lsh_num_chunks_after=None,
chunk_size_lm_head=None,
chunk_size_feed_forward=None,
feed_forward_size=None,
hidden_act=None,
hidden_dropout_prob=None,
local_attention_probs_dropout_prob=None,
lsh_attention_probs_dropout_prob=None,
max_position_embeddings=None,
initializer_range=None,
axial_norm_std=None,
layer_norm_eps=None,
axial_pos_embds=None,
axial_pos_shape=None,
axial_pos_embds_dim=None,
attn_layers=None,
pad_token_id=None,
eos_token_id=None,
scope=None,
hash_seed=None,
num_labels=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.is_decoder = is_decoder
self.use_input_mask = use_input_mask
self.use_labels = use_labels
self.vocab_size = vocab_size
self.attention_head_size = attention_head_size
self.hidden_size = hidden_size
self.num_attention_heads = num_attention_heads
self.num_hidden_layers = len(attn_layers)
self.local_attn_chunk_length = local_attn_chunk_length
self.local_num_chunks_after = local_num_chunks_after
self.local_num_chunks_before = local_num_chunks_before
self.num_hashes = num_hashes
self.num_buckets = tuple(num_buckets) if isinstance(num_buckets, list) else num_buckets
self.lsh_attn_chunk_length = lsh_attn_chunk_length
self.lsh_num_chunks_after = lsh_num_chunks_after
self.lsh_num_chunks_before = lsh_num_chunks_before
self.hidden_act = hidden_act
self.feed_forward_size = feed_forward_size
self.hidden_dropout_prob = hidden_dropout_prob
self.local_attention_probs_dropout_prob = local_attention_probs_dropout_prob
self.lsh_attention_probs_dropout_prob = lsh_attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.axial_pos_embds = axial_pos_embds
self.axial_pos_shape = tuple(axial_pos_shape)
self.axial_pos_embds_dim = tuple(axial_pos_embds_dim)
self.axial_norm_std = axial_norm_std
self.chunk_size_lm_head = chunk_size_lm_head
self.chunk_size_feed_forward = chunk_size_feed_forward
self.scope = scope
self.attn_layers = attn_layers
self.pad_token_id = pad_token_id
self.hash_seed = hash_seed
attn_chunk_length = local_attn_chunk_length if local_attn_chunk_length is not None else lsh_attn_chunk_length
num_chunks_after = local_num_chunks_after if local_num_chunks_after is not None else lsh_num_chunks_after
num_chunks_before = local_num_chunks_before if local_num_chunks_before is not None else lsh_num_chunks_before
self.encoder_seq_length = seq_length // attn_chunk_length + (self.seq_length % attn_chunk_length != 0)
self.key_length = (num_chunks_before + num_chunks_after + 1) * attn_chunk_length
self.chunk_length = attn_chunk_length
self.num_labels = num_labels
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = random_attention_mask([self.batch_size, self.seq_length])
choice_labels = None
if self.use_labels:
choice_labels = ids_tensor([self.batch_size], 2)
config = ReformerConfig(
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
feed_forward_size=self.feed_forward_size,
hidden_act=self.hidden_act,
hidden_dropout_prob=self.hidden_dropout_prob,
local_attention_probs_dropout_prob=self.local_attention_probs_dropout_prob,
lsh_attention_probs_dropout_prob=self.lsh_attention_probs_dropout_prob,
max_position_embeddings=self.max_position_embeddings,
is_decoder=self.is_decoder,
axial_pos_embds=self.axial_pos_embds,
axial_pos_shape=self.axial_pos_shape,
axial_pos_embds_dim=self.axial_pos_embds_dim,
local_attn_chunk_length=self.local_attn_chunk_length,
local_num_chunks_after=self.local_num_chunks_after,
local_num_chunks_before=self.local_num_chunks_before,
num_hashes=self.num_hashes,
num_buckets=self.num_buckets,
lsh_attn_chunk_length=self.lsh_attn_chunk_length,
lsh_num_chunks_after=self.lsh_num_chunks_after,
lsh_num_chunks_before=self.lsh_num_chunks_before,
attn_layers=self.attn_layers,
pad_token_id=self.pad_token_id,
hash_seed=self.hash_seed,
return_dict=True,
)
return (
config,
input_ids,
input_mask,
choice_labels,
)
def create_and_check_reformer_model(self, config, input_ids, input_mask, choice_labels):
model = ReformerModel(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask)
result = model(input_ids)
# 2 * hidden_size because we use reversible resnet layers
self.parent.assertEqual(
result.last_hidden_state.shape, (self.batch_size, self.seq_length, 2 * self.hidden_size)
)
def create_and_check_reformer_model_with_lm_backward(self, config, input_ids, input_mask, choice_labels):
config.is_decoder = False
config.lsh_num_chunks_after = 1
model = ReformerForMaskedLM(config=config)
model.to(torch_device)
model.eval()
loss = model(input_ids, attention_mask=input_mask, labels=input_ids)["loss"]
loss.backward()
def create_and_check_reformer_with_lm(self, config, input_ids, input_mask, choice_labels):
config.lsh_num_chunks_after = 0
config.is_decoder = True
model = ReformerModelWithLMHead(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, labels=input_ids)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def create_and_check_reformer_with_mlm(self, config, input_ids, input_mask, choice_labels):
config.is_decoder = False
model = ReformerForMaskedLM(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, labels=input_ids)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def create_and_check_reformer_model_with_attn_mask(
self, config, input_ids, input_mask, choice_labels, is_decoder=False
):
# no special position embeddings
config.axial_pos_embds = False
config.is_decoder = is_decoder
if self.lsh_attn_chunk_length is not None:
# need to set chunk length equal sequence length to be certain that chunking works
config.lsh_attn_chunk_length = self.seq_length
model = ReformerModel(config=config)
model.to(torch_device)
model.eval()
# set all position encodings to zero so that postions don't matter
with torch.no_grad():
embedding = model.embeddings.position_embeddings.embedding
embedding.weight = torch.nn.Parameter(torch.zeros(embedding.weight.shape).to(torch_device))
embedding.weight.requires_grad = False
half_seq_len = self.seq_length // 2
roll = self.chunk_length
half_input_ids = input_ids[:, :half_seq_len]
# normal padded
attn_mask = torch.cat(
[torch.ones_like(half_input_ids), torch.zeros_like(half_input_ids)],
dim=-1,
)
input_ids_padded = torch.cat(
[half_input_ids, ids_tensor((self.batch_size, half_seq_len), self.vocab_size)],
dim=-1,
)
# shifted padded
input_ids_roll = torch.cat(
[half_input_ids, ids_tensor((self.batch_size, half_seq_len), self.vocab_size)],
dim=-1,
)
input_ids_roll = torch.roll(input_ids_roll, roll, dims=-1)
attn_mask_roll = torch.roll(attn_mask, roll, dims=-1)
output_padded = model(input_ids_padded, attention_mask=attn_mask)[0][:, :half_seq_len]
output_padded_rolled = model(input_ids_roll, attention_mask=attn_mask_roll)[0][:, roll : half_seq_len + roll]
self.parent.assertTrue(torch.allclose(output_padded, output_padded_rolled, atol=1e-3))
def create_and_check_reformer_layer_dropout_seed(
self, config, input_ids, input_mask, choice_labels, is_decoder=False
):
config.is_decoder = is_decoder
layer = ReformerLayer(config).to(torch_device)
layer.train()
shape = (
self.batch_size,
self.seq_length,
config.hidden_size,
) # Batch x SeqLen x hiddenSize
# get random tensors
hidden_states = floats_tensor(shape)
prev_attn_output = floats_tensor(shape)
# now the random seeds for attention and feed forward is initialized
# forward tensors with dropout
layer_outputs = layer(prev_attn_output, hidden_states, attention_mask=input_mask)
next_attn_output = layer_outputs.attn_output
next_hidden_states = layer_outputs.hidden_states
torch.manual_seed(layer.attention_seed)
attn_outputs = layer.attention(hidden_states, attention_mask=input_mask)
self.parent.assertTrue(
torch.allclose(
prev_attn_output + attn_outputs.hidden_states,
next_attn_output,
atol=1e-3,
)
)
torch.manual_seed(layer.feed_forward_seed)
feed_forward_hidden_states = layer.feed_forward(next_attn_output)
self.parent.assertTrue(
torch.allclose(
next_hidden_states,
hidden_states + feed_forward_hidden_states,
atol=1e-3,
)
)
def create_and_check_reformer_feed_backward_chunking(self, config, input_ids, input_mask, choice_labels):
if not self.is_training:
return
# disable dropout
config.hidden_dropout_prob = 0
config.local_attention_probs_dropout_prob = 0
config.lsh_attention_probs_dropout_prob = 0
config.lsh_num_chunks_after = 1
config.is_decoder = False
torch.manual_seed(0)
model = ReformerForMaskedLM(config=config)
model.to(torch_device)
model.train()
model.zero_grad()
loss_no_chunk, output_no_chunk = model(input_ids, labels=input_ids, attention_mask=input_mask)[:2]
loss_no_chunk.backward()
grad_slice_word_no_chunk = model.reformer.embeddings.word_embeddings.weight.grad[0, :5]
grad_slice_position_factor_1_no_chunk = model.reformer.embeddings.position_embeddings.weights[0][1, 0, -5:]
grad_slice_position_factor_2_no_chunk = model.reformer.embeddings.position_embeddings.weights[1][0, 1, :5]
config.chunk_size_lm_head = 1
config.chunk_size_feed_forward = 1
torch.manual_seed(0)
model = ReformerForMaskedLM(config=config)
model.to(torch_device)
model.train()
model.zero_grad()
loss_chunk, output_chunk = model(input_ids, labels=input_ids, attention_mask=input_mask)[:2]
loss_chunk.backward()
grad_slice_word_chunk = model.reformer.embeddings.word_embeddings.weight.grad[0, :5]
grad_slice_position_factor_1_chunk = model.reformer.embeddings.position_embeddings.weights[0][1, 0, -5:]
grad_slice_position_factor_2_chunk = model.reformer.embeddings.position_embeddings.weights[1][0, 1, :5]
self.parent.assertTrue(torch.allclose(loss_chunk, loss_no_chunk, atol=1e-3))
self.parent.assertTrue(torch.allclose(grad_slice_word_no_chunk, grad_slice_word_chunk, atol=1e-3))
self.parent.assertTrue(
torch.allclose(grad_slice_position_factor_1_chunk, grad_slice_position_factor_1_no_chunk, atol=1e-3)
)
self.parent.assertTrue(
torch.allclose(grad_slice_position_factor_2_chunk, grad_slice_position_factor_2_no_chunk, atol=1e-3)
)
def create_and_check_reformer_random_seed(self, config, input_ids, input_mask, choice_labels):
layer = ReformerLayer(config).to(torch_device)
layer.train()
shape = (
self.batch_size,
self.seq_length,
config.hidden_size,
) # Batch x SeqLen x hiddenSize
hidden_states = floats_tensor(shape)
attn_output = floats_tensor(shape)
seeds = []
for _ in range(100):
layer_outputs = layer(attn_output, hidden_states, attention_mask=input_mask)
attn_output = layer_outputs.attn_output
hidden_states = layer_outputs.hidden_states
torch.manual_seed(layer.attention_seed)
seeds.append(layer.attention_seed)
self.parent.assertGreater(len(set(seeds)), 70)
seeds = []
for _ in range(100):
layer_outputs = layer(attn_output, hidden_states, attention_mask=input_mask)
attn_output = layer_outputs.attn_output
hidden_states = layer_outputs.hidden_states
torch.manual_seed(layer.feed_forward_seed)
seeds.append(layer.feed_forward_seed)
self.parent.assertGreater(len(set(seeds)), 70)
def create_and_check_reformer_model_fp16_forward(self, config, input_ids, input_mask, choice_labels):
model = ReformerModel(config=config)
model.to(torch_device)
model.half()
model.eval()
output = model(input_ids, attention_mask=input_mask)["last_hidden_state"]
self.parent.assertFalse(torch.isnan(output).any().item())
def create_and_check_reformer_model_generate(self, config, input_ids, input_mask, choice_labels):
config.is_decoder = True
config.lsh_num_chunks_after = 0
config.bos_token_id = 0
config.eos_token_id = None
config.max_length = 20
model = ReformerModelWithLMHead(config=config)
model.to(torch_device)
model.eval()
output = model.generate()
self.parent.assertIsNotNone(output)
def create_and_check_reformer_model_fp16_generate(self, config, input_ids, input_mask, choice_labels):
config.is_decoder = True
config.lsh_num_chunks_after = 0
model = ReformerModelWithLMHead(config=config)
model.to(torch_device)
model.half()
model.eval()
# only use last 10 inputs for generation
output = model.generate(input_ids[:, -10:], attention_mask=input_mask, do_sample=False)
self.parent.assertFalse(torch.isnan(output).any().item())
def create_and_check_reformer_no_chunking(self, config, input_ids, input_mask, choice_labels):
# force chunk length to be bigger than input_ids
config.lsh_attn_chunk_length = 2 * input_ids.shape[-1]
config.local_attn_chunk_length = 2 * input_ids.shape[-1]
config.lsh_num_chunks_after = 1
config.is_decoder = False
model = ReformerForMaskedLM(config=config)
model.to(torch_device)
model.eval()
output_logits = model(input_ids, attention_mask=input_mask)["logits"]
self.parent.assertTrue(output_logits.shape[1] == input_ids.shape[-1])
def create_and_check_reformer_for_question_answering(self, config, input_ids, input_mask, choice_labels):
model = ReformerForQuestionAnswering(config=config)
model.to(torch_device)
model.eval()
result = model(
input_ids,
attention_mask=input_mask,
start_positions=choice_labels,
end_positions=choice_labels,
)
self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
def create_and_check_past_buckets_states(self, config, input_ids, input_mask, choice_labels):
config.is_decoder = True
config.lsh_num_chunks_before = 1
config.lsh_num_chunks_after = 0
model = ReformerModelWithLMHead(config=config)
model.to(torch_device)
model.eval()
input_ids_first = input_ids[:, :-1]
input_ids_second = input_ids[:, -1:]
# return saved cache
past_buckets_states = model(input_ids_first, use_cache=True)["past_buckets_states"]
# calculate last output with and without cache
outputs_with_cache = model(input_ids_second, past_buckets_states=past_buckets_states, use_cache=True)["logits"]
outputs_without_cache = model(input_ids)["logits"][:, -1]
# select random slice idx
random_slice_idx = torch.randint(outputs_without_cache.shape[-1], (1, 1), device=torch_device).item()
# outputs should be similar within range
self.parent.assertTrue(
torch.allclose(
outputs_with_cache[:, 0, random_slice_idx], outputs_without_cache[:, random_slice_idx], atol=1e-2
)
)
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(config, input_ids, input_mask, choice_labels) = config_and_inputs
inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
return config, inputs_dict
def create_and_check_reformer_for_sequence_classification(
self, config, input_ids, input_mask, choice_labels, is_decoder
):
config.is_decoder = is_decoder
sequence_labels = ids_tensor([self.batch_size], config.num_labels)
model = ReformerForSequenceClassification(config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, labels=sequence_labels)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
class ReformerTesterMixin:
"""
Reformer Local and Reformer LSH run essentially the same tests
"""
def test_config(self):
self.config_tester.run_common_tests()
def test_reformer_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_reformer_model(*config_and_inputs)
def test_reformer_lm_model_backward(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_reformer_model_with_lm_backward(*config_and_inputs)
def test_reformer_model_attn_masking(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_reformer_model_with_attn_mask(*config_and_inputs, is_decoder=True)
self.model_tester.create_and_check_reformer_model_with_attn_mask(*config_and_inputs, is_decoder=False)
def test_reformer_with_lm(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_reformer_with_lm(*config_and_inputs)
def test_reformer_with_mlm(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_reformer_with_mlm(*config_and_inputs)
def test_reformer_layer_training_dropout(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_reformer_layer_dropout_seed(*config_and_inputs, is_decoder=True)
self.model_tester.create_and_check_reformer_layer_dropout_seed(*config_and_inputs, is_decoder=False)
def test_reformer_chunking_backward_equality(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_reformer_feed_backward_chunking(*config_and_inputs)
def test_reformer_no_chunking(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_reformer_no_chunking(*config_and_inputs)
def test_reformer_qa_answering(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_reformer_for_question_answering(*config_and_inputs)
def test_reformer_cached_inference(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_past_buckets_states(*config_and_inputs)
def test_reformer_cached_generate(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_reformer_model_generate(*config_and_inputs)
@slow
def test_dropout_random_seed_is_changing(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_reformer_random_seed(*config_and_inputs)
@unittest.skipIf(torch_device == "cpu", "Cant do half precision")
def test_reformer_model_fp16_forward(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_reformer_model_fp16_forward(*config_and_inputs)
@unittest.skipIf(torch_device == "cpu", "Cant do half precision")
def test_reformer_model_fp16_generate(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_reformer_model_fp16_generate(*config_and_inputs)
@require_torch_multigpu
def test_multigpu_data_parallel_forward(self):
# Opt-out of this test.
pass
def test_for_sequence_classification(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_reformer_for_sequence_classification(*config_and_inputs, is_decoder=False)
@require_torch
class ReformerLocalAttnModelTest(ReformerTesterMixin, ModelTesterMixin, unittest.TestCase):
all_model_classes = (
(ReformerModel, ReformerModelWithLMHead, ReformerForSequenceClassification, ReformerForQuestionAnswering)
if is_torch_available()
else ()
)
all_generative_model_classes = (ReformerModelWithLMHead,) if is_torch_available() else ()
test_pruning = False
test_headmasking = False
test_torchscript = False
def prepare_kwargs(self):
return {
"batch_size": 13,
"seq_length": 32,
"is_training": True,
"is_decoder": True,
"use_input_mask": True,
"use_labels": True,
"vocab_size": 32,
"attention_head_size": 16,
"hidden_size": 32,
"num_attention_heads": 2,
"local_attn_chunk_length": 4,
"local_num_chunks_before": 1,
"local_num_chunks_after": 0,
"chunk_size_lm_head": 0,
"chunk_size_feed_forward": 0,
"feed_forward_size": 32,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"local_attention_probs_dropout_prob": 0.1,
"max_position_embeddings": 512,
"initializer_range": 0.02,
"axial_norm_std": 1.0,
"layer_norm_eps": 1e-12,
"axial_pos_embds": True,
"axial_pos_shape": [4, 8],
"axial_pos_embds_dim": [16, 16],
"attn_layers": ["local", "local", "local", "local"],
"pad_token_id": 0,
"eos_token_id": 2,
"scope": None,
"hash_seed": 0,
"num_labels": 2,
}
def setUp(self):
tester_kwargs = self.prepare_kwargs()
self.model_tester = ReformerModelTester(self, **tester_kwargs)
self.config_tester = ConfigTester(self, config_class=ReformerConfig, hidden_size=37)
@slow
def test_model_from_pretrained(self):
for model_name in REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
model = ReformerModelWithLMHead.from_pretrained(model_name)
self.assertIsNotNone(model)
@require_torch
class ReformerLSHAttnModelTest(ReformerTesterMixin, ModelTesterMixin, unittest.TestCase):
all_model_classes = (
(ReformerModel, ReformerModelWithLMHead, ReformerForSequenceClassification, ReformerForQuestionAnswering)
if is_torch_available()
else ()
)
all_generative_model_classes = (ReformerModelWithLMHead,) if is_torch_available() else ()
test_pruning = False
test_headmasking = False
test_torchscript = False
def prepare_kwargs(self):
return {
"batch_size": 13,
"seq_length": 13,
"use_input_mask": True,
"use_labels": True,
"is_training": False,
"is_decoder": True,
"vocab_size": 32,
"attention_head_size": 16,
"hidden_size": 64,
"num_attention_heads": 2,
"num_buckets": 2,
"num_hashes": 4,
"lsh_attn_chunk_length": 4,
"lsh_num_chunks_before": 1,
"lsh_num_chunks_after": 0,
"chunk_size_lm_head": 5,
"chunk_size_feed_forward": 6,
"feed_forward_size": 32,
"hidden_act": "relu",
"hidden_dropout_prob": 0.1,
"lsh_attention_probs_dropout_prob": 0.1,
"max_position_embeddings": 512,
"initializer_range": 0.02,
"axial_norm_std": 1.0,
"layer_norm_eps": 1e-12,
"axial_pos_embds": True,
"axial_pos_shape": [4, 8],
"axial_pos_embds_dim": [16, 48],
# sanotheu
# "attn_layers": ["lsh", "lsh", "lsh", "lsh"],
"attn_layers": ["lsh"],
"pad_token_id": 0,
"eos_token_id": 2,
"scope": None,
"hash_seed": 0,
"num_labels": 2,
}
def setUp(self):
tester_kwargs = self.prepare_kwargs()
self.model_tester = ReformerModelTester(self, **tester_kwargs)
self.config_tester = ConfigTester(self, config_class=ReformerConfig, hidden_size=37)
@require_torch
@require_sentencepiece
@require_tokenizers
class ReformerIntegrationTests(unittest.TestCase):
"""
These integration tests test the current layer activations and gradients againts the output of the Hugging Face Reformer model at time of integration: 29/06/2020. During integration, the model was tested against the output of the official Trax ReformerLM model for various cases ("lsh" only, "local" only, masked / non-masked, different chunk length, ....). In order to recover the original trax integration tests, one should use patrickvonplaten's fork of trax and the code that lives on the branch `reformer_trax_tests`.
"""
def _get_basic_config_and_input(self):
config = {
"vocab_size": 320,
"attention_head_size": 8,
"hidden_size": 16,
"num_attention_heads": 2,
"num_buckets": 2,
"num_hashes": 4,
"lsh_attn_chunk_length": 4,
"local_attn_chunk_length": 4,
"lsh_num_chunks_before": 1,
"lsh_num_chunks_after": 0,
"local_num_chunks_before": 1,
"local_num_chunks_after": 0,
"chunk_size_lm_head": 0,
"chunk_size_feed_forward": 0,
"feed_forward_size": 32,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.0,
"lsh_attention_probs_dropout_prob": 0.0,
"local_attention_probs_dropout_prob": 0.0,
"max_position_embeddings": 32,
"initializer_range": 0.02,
"axial_norm_std": 1.0,
"layer_norm_eps": 1e-12,
"sinusoidal_pos_embds": False,
"axial_pos_embds": True,
"axial_pos_shape": [4, 8],
"axial_pos_embds_dim": [8, 8],
"hash_seed": 0,
"is_decoder": True,
}
return config
def _get_hidden_states(self):
return torch.tensor(
[
[
[
1.90826353e00,
-1.45999730e00,
-6.20405462e-01,
1.52503433e00,
-3.64464232e-01,
-8.27359235e-01,
8.39670803e-01,
2.44492178e-01,
4.98332758e-01,
2.69175139e00,
-7.08081422e-03,
1.04915401e00,
-1.83476661e00,
7.67220476e-01,
2.98580543e-01,
2.84803992e-02,
],
[
-2.66374286e-02,
4.33497576e-01,
3.10386309e-01,
5.46039944e-01,
-2.47292666e-04,
-7.52305019e-01,
2.39162103e-01,
7.25216186e-01,
-7.58357372e-01,
4.20635998e-01,
-4.04739919e-02,
1.59924145e-01,
2.05135748e00,
-1.15997978e00,
5.37166397e-01,
2.62873606e-01,
],
[
1.85247482e-01,
7.07046037e-01,
-6.77089715e-01,
-2.24209655e00,
-3.75307980e-02,
-8.59380874e-01,
-2.81027884e00,
1.01276376e00,
-1.69438001e00,
4.17574660e-01,
-1.49196962e00,
-1.76483717e00,
-1.94566312e-01,
-1.71183858e00,
7.72903565e-01,
-1.11557056e00,
],
[
9.46069193e-01,
1.53417623e-01,
-9.58686996e-01,
1.18126669e-01,
1.75967724e00,
1.62194590e00,
-5.74108159e-01,
6.79920443e-01,
5.44028163e-01,
2.05466114e-01,
-3.63045868e-01,
2.41865062e-01,
3.20348382e-01,
-9.05611176e-01,
-1.92690727e-01,
-1.19917547e00,
],
]
],
dtype=torch.float32,
device=torch_device,
)
def _get_attn_mask(self):
return torch.tensor([[0, 1, 0, 0]], dtype=torch.long, device=torch_device)
def _get_input_ids_and_mask(self):
mask = torch.tensor(
[
[1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1],
[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0],
],
dtype=torch.long,
device=torch_device,
)
input_ids = torch.tensor(
[
[
89,
279,
286,
84,
194,
316,
182,
28,
283,
37,
169,
7,
253,
267,
107,
250,
44,
7,
102,
62,
3,
243,
171,
265,
302,
48,
164,
264,
148,
229,
280,
150,
],
[
9,
192,
66,
112,
163,
83,
135,
70,
224,
96,
31,
80,
196,
80,
63,
22,
85,
100,
47,
283,
0,
163,
126,
143,
195,
82,
53,
82,
18,
27,
182,
52,
],
],
dtype=torch.long,
device=torch_device,
)
return input_ids, mask
def test_lsh_layer_forward(self):
config = self._get_basic_config_and_input()
config["lsh_num_chunks_before"] = 0
config["attn_layers"] = ["lsh"]
config["is_decoder"] = False
hidden_states = self._get_hidden_states()
torch.manual_seed(0)
layer = ReformerLayer(ReformerConfig(**config)).to(torch_device)
layer.eval()
reformer_output = layer(prev_attn_output=hidden_states.clone(), hidden_states=hidden_states)
output_slice = reformer_output.hidden_states[0, 0, :5]
expected_output_slice = torch.tensor(
[1.6879, -1.3083, -0.4708, 1.3555, -0.6292],
dtype=torch.float,
device=torch_device,
)
self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))
def test_lsh_layer_forward_complex(self):
config = self._get_basic_config_and_input()
config["lsh_num_chunks_before"] = 0
config["attn_layers"] = ["lsh"]
config["num_buckets"] = [2, 4]
attn_mask = self._get_attn_mask()
hidden_states = self._get_hidden_states()
torch.manual_seed(0)
layer = ReformerLayer(ReformerConfig(**config)).to(torch_device)
layer.eval()
reformer_output = layer(
prev_attn_output=hidden_states.clone(),
hidden_states=hidden_states,
attention_mask=attn_mask,
)
output_slice = reformer_output.hidden_states[0, 0, :5]
expected_output_slice = torch.tensor(
[1.6439, -1.2306, -0.5108, 1.3006, -0.6537],
dtype=torch.float,
device=torch_device,
)
self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))
def test_local_layer_forward(self):
config = self._get_basic_config_and_input()
config["local_num_chunks_before"] = 0
config["attn_layers"] = ["local"]
config["is_decoder"] = False
hidden_states = self._get_hidden_states()
torch.manual_seed(0)
layer = ReformerLayer(ReformerConfig(**config)).to(torch_device)
layer.eval()
reformer_output = layer(prev_attn_output=hidden_states, hidden_states=hidden_states)
output_slice = reformer_output.hidden_states[0, 0, :5]
expected_output_slice = torch.tensor(
[1.4212, -2.0576, -0.9688, 1.4599, -0.1344],
dtype=torch.float,
device=torch_device,
)
self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))
def test_local_layer_forward_complex(self):
config = self._get_basic_config_and_input()
config["local_num_chunks_before"] = 0
config["attn_layers"] = ["local"]
attn_mask = self._get_attn_mask()
hidden_states = self._get_hidden_states()
torch.manual_seed(0)
layer = ReformerLayer(ReformerConfig(**config)).to(torch_device)
layer.eval()
reformer_output = layer(
prev_attn_output=hidden_states,
hidden_states=hidden_states,
attention_mask=attn_mask,
)
output_slice = reformer_output.hidden_states[0, 0, :5]
expected_output_slice = torch.tensor(
[1.4750, -2.0235, -0.9743, 1.4463, -0.1269],
dtype=torch.float,
device=torch_device,
)
self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))
def test_lsh_model_forward(self):
config = self._get_basic_config_and_input()
config["attn_layers"] = ["lsh", "lsh", "lsh", "lsh"]
config["num_buckets"] = [2, 4]
torch.manual_seed(0)
model = ReformerModel(ReformerConfig(**config)).to(torch_device)
model.eval()
input_ids, attn_mask = self._get_input_ids_and_mask()
hidden_states = model(input_ids=input_ids, attention_mask=attn_mask)[0]
output_slice = hidden_states[0, 0, :5]
expected_output_slice = torch.tensor(
[-0.9896, -0.9396, -1.0831, -0.0597, 0.2456],
dtype=torch.float,
device=torch_device,
)
self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))
def test_local_model_forward(self):
config = self._get_basic_config_and_input()
config["attn_layers"] = ["local", "local", "local", "local"]
torch.manual_seed(0)
model = ReformerModel(ReformerConfig(**config)).to(torch_device)
model.eval()
input_ids, attn_mask = self._get_input_ids_and_mask()
hidden_states = model(input_ids=input_ids, attention_mask=attn_mask)[0]
output_slice = hidden_states[0, 0, :5]
expected_output_slice = torch.tensor(
[-1.6791, 0.7171, 0.1594, 0.4063, 1.2584],
dtype=torch.float,
device=torch_device,
)
self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))
def test_lm_model_forward(self):
config = self._get_basic_config_and_input()
config["attn_layers"] = ["local", "lsh", "local", "lsh", "local", "lsh"]
config["num_buckets"] = [2, 4]
config["is_decoder"] = False
torch.manual_seed(0)
model = ReformerForMaskedLM(ReformerConfig(**config)).to(torch_device)
model.eval()
input_ids, attn_mask = self._get_input_ids_and_mask()
hidden_states = model(input_ids=input_ids, attention_mask=attn_mask)[0]
output_slice = hidden_states[1, -1, :5]
expected_output_slice = torch.tensor(
[0.0256, -0.0121, 0.0636, 0.0024, -0.0393],
dtype=torch.float,
device=torch_device,
)
self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))
def test_local_lm_model_grad(self):
config = self._get_basic_config_and_input()
config["attn_layers"] = ["local", "local", "local", "local"]
config["hidden_dropout_prob"] = 0.0
config["local_attention_probs_dropout_prob"] = 0.0
torch.manual_seed(0)
model = ReformerModelWithLMHead(ReformerConfig(**config)).to(torch_device)
model.train()
model.zero_grad()
input_ids, _ = self._get_input_ids_and_mask()
loss = model(input_ids=input_ids, labels=input_ids)[0]
self.assertTrue(torch.allclose(loss, torch.tensor(5.7786, dtype=torch.float, device=torch_device), atol=1e-3))
loss.backward()
# check last grads to cover all proable errors
grad_slice_word = model.reformer.embeddings.word_embeddings.weight.grad[0, :5]
expected_grad_slice_word = torch.tensor(
[-0.0005, 0.0001, 0.0002, 0.0003, 0.0006],
dtype=torch.float,
device=torch_device,
)
grad_slice_position_factor_1 = model.reformer.embeddings.position_embeddings.weights[0][1, 0, -5:]
expected_grad_slice_pos_fac_1 = torch.tensor(
[0.0037, -1.3793, -1.0231, -1.5230, -2.5306],
dtype=torch.float,
device=torch_device,
)
grad_slice_position_factor_2 = model.reformer.embeddings.position_embeddings.weights[1][0, 1, :5]
expected_grad_slice_pos_fac_2 = torch.tensor(
[-1.3165, 0.5168, 0.7785, 1.0811, -0.9830],
dtype=torch.float,
device=torch_device,
)
self.assertTrue(torch.allclose(grad_slice_word, expected_grad_slice_word, atol=1e-3))
self.assertTrue(torch.allclose(grad_slice_position_factor_1, expected_grad_slice_pos_fac_1, atol=1e-3))
self.assertTrue(torch.allclose(grad_slice_position_factor_2, expected_grad_slice_pos_fac_2, atol=1e-3))
def test_lsh_lm_model_grad(self):
config = self._get_basic_config_and_input()
config["attn_layers"] = ["lsh", "lsh", "lsh", "lsh"]
config["hidden_dropout_prob"] = 0.0
config["lsh_attention_probs_dropout_prob"] = 0.0
config["num_buckets"] = [2, 4]
config["num_hashes"] = 6
torch.manual_seed(0)
model = ReformerModelWithLMHead(ReformerConfig(**config)).to(torch_device)
model.train()
model.zero_grad()
input_ids, _ = self._get_input_ids_and_mask()
loss = model(input_ids=input_ids, labels=input_ids)[0]
self.assertTrue(torch.allclose(loss, torch.tensor(5.7819, dtype=torch.float, device=torch_device), atol=1e-3))
loss.backward()
# check last grads to cover all proable errors
grad_slice_word = model.reformer.embeddings.word_embeddings.weight.grad[0, :5]
expected_grad_slice_word = torch.tensor(
[2.6357e-05, 4.3358e-04, -8.4985e-04, 1.0094e-04, 3.8954e-04],
dtype=torch.float,
device=torch_device,
)
grad_slice_position_factor_1 = model.reformer.embeddings.position_embeddings.weights[0][1, 0, -5:]
expected_grad_slice_pos_fac_1 = torch.tensor(
[-0.0984, 0.6283, 0.4282, 1.2960, 0.6897],
dtype=torch.float,
device=torch_device,
)
grad_slice_position_factor_2 = model.reformer.embeddings.position_embeddings.weights[1][0, 1, :5]
expected_grad_slice_pos_fac_2 = torch.tensor(
[0.4626, -0.0231, -0.0172, 0.1081, 0.3805],
dtype=torch.float,
device=torch_device,
)
self.assertTrue(torch.allclose(grad_slice_word, expected_grad_slice_word, atol=1e-3))
self.assertTrue(torch.allclose(grad_slice_position_factor_1, expected_grad_slice_pos_fac_1, atol=1e-3))
self.assertTrue(torch.allclose(grad_slice_position_factor_2, expected_grad_slice_pos_fac_2, atol=1e-3))
@slow
def test_pretrained_generate_crime_and_punish(self):
model = ReformerModelWithLMHead.from_pretrained("google/reformer-crime-and-punishment").to(torch_device)
tokenizer = ReformerTokenizer.from_pretrained("google/reformer-crime-and-punishment")
model.eval()
input_ids = tokenizer.encode("A few months later", return_tensors="pt").to(torch_device)
output_ids = model.generate(
input_ids, max_length=50, num_beams=4, early_stopping=True, do_sample=False, num_hashes=8
)
output = tokenizer.decode(output_ids[0])
self.assertEqual(
output,
"A few months later state expression in his ideas, at the first entrance. He was positively for an inst",
)
@slow
def test_pretrained_generate_use_cache_equality(self):
model = ReformerModelWithLMHead.from_pretrained("google/reformer-crime-and-punishment").to(torch_device)
tokenizer = ReformerTokenizer.from_pretrained("google/reformer-crime-and-punishment")
model.eval()
input_ids = tokenizer.encode("A few months later", return_tensors="pt").to(torch_device)
output_ids_with_cache = model.generate(input_ids, max_length=130, num_hashes=8, use_cache=False)
output_ids_without_cache = model.generate(input_ids, max_length=130, num_hashes=8, use_cache=True)
output_with_cache = tokenizer.decode(output_ids_with_cache[0])
output_without_cache = tokenizer.decode(output_ids_without_cache[0])
self.assertEqual(output_with_cache, output_without_cache)