CI with `num_hidden_layers=2` 🚀🚀🚀 (#25266)
* CI with layers=2 --------- Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
parent
b28ebb2655
commit
bd90cda9a6
|
@ -54,8 +54,9 @@ class AlbertModelTester:
|
|||
vocab_size=99,
|
||||
embedding_size=16,
|
||||
hidden_size=36,
|
||||
num_hidden_layers=6,
|
||||
num_hidden_groups=6,
|
||||
num_hidden_layers=2,
|
||||
# this needs to be the same as `num_hidden_layers`!
|
||||
num_hidden_groups=2,
|
||||
num_attention_heads=6,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -48,7 +48,7 @@ class FlaxAlbertModelTester(unittest.TestCase):
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -242,7 +242,7 @@ class AlignTextModelTester:
|
|||
use_token_type_ids=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -60,7 +60,7 @@ class AltCLIPVisionModelTester:
|
|||
is_training=True,
|
||||
hidden_size=32,
|
||||
projection_dim=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
dropout=0.1,
|
||||
|
@ -212,7 +212,7 @@ class AltCLIPTextModelTester:
|
|||
hidden_size=32,
|
||||
projection_dim=32,
|
||||
project_dim=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
dropout=0.1,
|
||||
|
|
|
@ -55,7 +55,7 @@ class ASTModelTester:
|
|||
is_training=True,
|
||||
use_labels=True,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -1289,7 +1289,7 @@ class BartStandaloneDecoderModelTester:
|
|||
use_labels=True,
|
||||
decoder_start_token_id=2,
|
||||
decoder_ffn_dim=32,
|
||||
decoder_layers=4,
|
||||
decoder_layers=2,
|
||||
encoder_attention_heads=4,
|
||||
decoder_attention_heads=4,
|
||||
max_position_embeddings=30,
|
||||
|
|
|
@ -64,7 +64,7 @@ class BeitModelTester:
|
|||
is_training=True,
|
||||
use_labels=True,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=4,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -48,7 +48,7 @@ class FlaxBeitModelTester(unittest.TestCase):
|
|||
is_training=True,
|
||||
use_labels=True,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -57,7 +57,7 @@ class BertModelTester:
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -47,7 +47,7 @@ class FlaxBertModelTester(unittest.TestCase):
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -41,7 +41,7 @@ class BertGenerationEncoderTester:
|
|||
use_input_mask=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -605,7 +605,7 @@ class BigBirdPegasusStandaloneDecoderModelTester:
|
|||
use_labels=True,
|
||||
decoder_start_token_id=2,
|
||||
decoder_ffn_dim=32,
|
||||
decoder_layers=4,
|
||||
decoder_layers=2,
|
||||
encoder_attention_heads=4,
|
||||
decoder_attention_heads=4,
|
||||
max_position_embeddings=30,
|
||||
|
|
|
@ -51,7 +51,7 @@ class BioGptModelTester:
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -356,7 +356,7 @@ class BlenderbotStandaloneDecoderModelTester:
|
|||
use_labels=True,
|
||||
decoder_start_token_id=2,
|
||||
decoder_ffn_dim=32,
|
||||
decoder_layers=4,
|
||||
decoder_layers=2,
|
||||
encoder_attention_heads=4,
|
||||
decoder_attention_heads=4,
|
||||
max_position_embeddings=30,
|
||||
|
|
|
@ -365,7 +365,7 @@ class BlenderbotSmallStandaloneDecoderModelTester:
|
|||
use_labels=True,
|
||||
decoder_start_token_id=2,
|
||||
decoder_ffn_dim=32,
|
||||
decoder_layers=4,
|
||||
decoder_layers=2,
|
||||
encoder_attention_heads=4,
|
||||
decoder_attention_heads=4,
|
||||
max_position_embeddings=30,
|
||||
|
|
|
@ -70,7 +70,7 @@ class BlipVisionModelTester:
|
|||
is_training=True,
|
||||
hidden_size=32,
|
||||
projection_dim=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
dropout=0.1,
|
||||
|
@ -221,7 +221,7 @@ class BlipTextModelTester:
|
|||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
projection_dim=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
dropout=0.1,
|
||||
|
|
|
@ -44,7 +44,7 @@ class BlipTextModelTester:
|
|||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
projection_dim=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
dropout=0.1,
|
||||
|
|
|
@ -62,7 +62,7 @@ class Blip2VisionModelTester:
|
|||
is_training=True,
|
||||
hidden_size=32,
|
||||
projection_dim=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
dropout=0.1,
|
||||
|
@ -215,7 +215,7 @@ class Blip2QFormerModelTester:
|
|||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
projection_dim=32,
|
||||
num_hidden_layers=6,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
dropout=0.1,
|
||||
|
@ -289,7 +289,7 @@ class Blip2TextModelDecoderOnlyTester:
|
|||
use_labels=False,
|
||||
vocab_size=99,
|
||||
hidden_size=16,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=4,
|
||||
hidden_act="gelu",
|
||||
|
@ -503,7 +503,7 @@ class Blip2TextModelTester:
|
|||
use_attention_mask=True,
|
||||
use_labels=True,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
d_ff=37,
|
||||
relative_attention_num_buckets=8,
|
||||
|
|
|
@ -54,7 +54,7 @@ class BloomModelTester:
|
|||
use_mc_token_ids=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -53,7 +53,7 @@ class CanineModelTester:
|
|||
# NOTE: this is not a model parameter, just an input
|
||||
vocab_size=100000,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -69,7 +69,7 @@ class ChineseCLIPTextModelTester:
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
@ -246,7 +246,7 @@ class ChineseCLIPVisionModelTester:
|
|||
is_training=True,
|
||||
hidden_size=32,
|
||||
projection_dim=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
dropout=0.1,
|
||||
|
|
|
@ -287,7 +287,7 @@ class ClapTextModelTester:
|
|||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
projection_dim=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
dropout=0.1,
|
||||
|
|
|
@ -86,7 +86,7 @@ class CLIPVisionModelTester:
|
|||
is_training=True,
|
||||
hidden_size=32,
|
||||
projection_dim=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
dropout=0.1,
|
||||
|
@ -261,7 +261,7 @@ class CLIPTextModelTester:
|
|||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
projection_dim=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
dropout=0.1,
|
||||
|
|
|
@ -35,7 +35,7 @@ class FlaxCLIPVisionModelTester:
|
|||
num_channels=3,
|
||||
is_training=True,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
dropout=0.1,
|
||||
|
@ -252,7 +252,7 @@ class FlaxCLIPTextModelTester:
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
dropout=0.1,
|
||||
|
|
|
@ -78,7 +78,7 @@ class CLIPSegVisionModelTester:
|
|||
num_channels=3,
|
||||
is_training=True,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
dropout=0.1,
|
||||
|
@ -228,7 +228,7 @@ class CLIPSegTextModelTester:
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
dropout=0.1,
|
||||
|
@ -346,7 +346,15 @@ class CLIPSegTextModelTest(ModelTesterMixin, unittest.TestCase):
|
|||
|
||||
|
||||
class CLIPSegModelTester:
|
||||
def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True):
|
||||
def __init__(
|
||||
self,
|
||||
parent,
|
||||
text_kwargs=None,
|
||||
vision_kwargs=None,
|
||||
is_training=True,
|
||||
# This should respect the `num_hidden_layers` in `CLIPSegVisionModelTester`
|
||||
extract_layers=(1,),
|
||||
):
|
||||
if text_kwargs is None:
|
||||
text_kwargs = {}
|
||||
if vision_kwargs is None:
|
||||
|
@ -356,6 +364,7 @@ class CLIPSegModelTester:
|
|||
self.text_model_tester = CLIPSegTextModelTester(parent, **text_kwargs)
|
||||
self.vision_model_tester = CLIPSegVisionModelTester(parent, **vision_kwargs)
|
||||
self.is_training = is_training
|
||||
self.extract_layers = extract_layers
|
||||
|
||||
def prepare_config_and_inputs(self):
|
||||
text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
|
||||
|
@ -371,7 +380,7 @@ class CLIPSegModelTester:
|
|||
self.vision_model_tester.get_config(),
|
||||
projection_dim=64,
|
||||
reduce_dim=32,
|
||||
extract_layers=[1, 2, 3],
|
||||
extract_layers=self.extract_layers,
|
||||
)
|
||||
|
||||
def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
|
||||
|
|
|
@ -47,7 +47,7 @@ class CodeGenModelTester:
|
|||
vocab_size=256,
|
||||
hidden_size=32,
|
||||
rotary_dim=4,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -53,7 +53,7 @@ class ConvBertModelTester:
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -49,7 +49,7 @@ class CpmAntModelTester:
|
|||
use_mc_token_ids=False,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=3,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
num_buckets=32,
|
||||
|
|
|
@ -49,7 +49,7 @@ class CTRLModelTester:
|
|||
use_mc_token_ids=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -59,7 +59,7 @@ class Data2VecAudioModelTester:
|
|||
conv_bias=False,
|
||||
num_conv_pos_embeddings=16,
|
||||
num_conv_pos_embedding_groups=2,
|
||||
num_hidden_layers=4,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=2,
|
||||
hidden_dropout_prob=0.1,
|
||||
intermediate_size=20,
|
||||
|
|
|
@ -57,7 +57,7 @@ class Data2VecTextModelTester:
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -59,7 +59,7 @@ class Data2VecVisionModelTester:
|
|||
is_training=True,
|
||||
use_labels=True,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=4,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -47,7 +47,7 @@ class DebertaModelTester(object):
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -48,7 +48,7 @@ class DebertaV2ModelTester(object):
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -69,7 +69,7 @@ class DeiTModelTester:
|
|||
is_training=True,
|
||||
use_labels=True,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -57,7 +57,7 @@ class Dinov2ModelTester:
|
|||
is_training=True,
|
||||
use_labels=True,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -50,7 +50,7 @@ class DistilBertModelTester(object):
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -47,7 +47,7 @@ class FlaxDistilBertModelTester(unittest.TestCase):
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -48,7 +48,7 @@ class DPRModelTester:
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -53,7 +53,7 @@ class DPTModelTester:
|
|||
is_training=True,
|
||||
use_labels=True,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=4,
|
||||
num_hidden_layers=2,
|
||||
backbone_out_indices=[0, 1, 2, 3],
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
|
@ -62,7 +62,7 @@ class DPTModelTester:
|
|||
attention_probs_dropout_prob=0.1,
|
||||
initializer_range=0.02,
|
||||
num_labels=3,
|
||||
neck_hidden_sizes=[16, 16, 32, 32],
|
||||
neck_hidden_sizes=[16, 32],
|
||||
is_hybrid=False,
|
||||
scope=None,
|
||||
):
|
||||
|
|
|
@ -54,7 +54,7 @@ class ElectraModelTester:
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -34,7 +34,7 @@ class FlaxElectraModelTester(unittest.TestCase):
|
|||
vocab_size=99,
|
||||
embedding_size=24,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -56,7 +56,7 @@ class ErnieModelTester:
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -50,7 +50,7 @@ class ErnieMModelTester:
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -49,7 +49,7 @@ class EsmModelTester:
|
|||
use_labels=True,
|
||||
vocab_size=33,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -43,7 +43,7 @@ class EsmFoldModelTester:
|
|||
use_labels=False,
|
||||
vocab_size=19,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -50,7 +50,7 @@ class FalconModelTester:
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -57,7 +57,7 @@ class FlaubertModelTester(object):
|
|||
vocab_size=99,
|
||||
n_special=0,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
hidden_dropout_prob=0.1,
|
||||
attention_probs_dropout_prob=0.1,
|
||||
|
|
|
@ -79,7 +79,7 @@ class FlavaImageModelTester:
|
|||
parent,
|
||||
batch_size=12,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
@ -342,7 +342,7 @@ class FlavaTextModelTester:
|
|||
max_position_embeddings=512,
|
||||
position_embedding_type="absolute",
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
@ -487,7 +487,7 @@ class FlavaMultimodalModelTester:
|
|||
seq_length=44,
|
||||
use_input_mask=True,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -70,7 +70,7 @@ class FNetModelTester:
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
hidden_dropout_prob=0.1,
|
||||
|
|
|
@ -51,7 +51,7 @@ class GitVisionModelTester:
|
|||
is_training=True,
|
||||
hidden_size=32,
|
||||
projection_dim=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
dropout=0.1,
|
||||
|
@ -203,7 +203,7 @@ class GitModelTester:
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=4,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -52,7 +52,7 @@ class FlaxGPT2ModelTester:
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -56,7 +56,7 @@ class GPT2ModelTester:
|
|||
use_mc_token_ids=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -55,7 +55,7 @@ class GPTBigCodeModelTester:
|
|||
use_mc_token_ids=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="relu",
|
||||
|
|
|
@ -52,9 +52,9 @@ class FlaxGPTNeoModelTester:
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=4,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
attention_types=[[["global", "local"], 2]],
|
||||
attention_types=[[["global", "local"], 1]],
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
hidden_dropout_prob=0.1,
|
||||
|
|
|
@ -54,8 +54,8 @@ class GPTNeoModelTester:
|
|||
use_mc_token_ids=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=4,
|
||||
attention_types=[[["global", "local"], 2]],
|
||||
num_hidden_layers=2,
|
||||
attention_types=[[["global", "local"], 1]],
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -52,7 +52,7 @@ class GPTNeoXModelTester:
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=64,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -44,7 +44,7 @@ class GPTNeoXJapaneseModelTester:
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_multiple_size=4,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -53,7 +53,7 @@ class FlaxGPTJModelTester:
|
|||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
rotary_dim=4,
|
||||
num_hidden_layers=4,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -56,7 +56,7 @@ class GPTJModelTester:
|
|||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
rotary_dim=4,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -45,7 +45,7 @@ class GPTSanJapaneseTester:
|
|||
is_training=True,
|
||||
hidden_size=32,
|
||||
ext_size=42,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_ext_layers=2,
|
||||
num_attention_heads=4,
|
||||
num_experts=2,
|
||||
|
|
|
@ -356,7 +356,7 @@ class GroupViTTextModelTester:
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
dropout=0.1,
|
||||
|
@ -553,6 +553,10 @@ class GroupViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
|
|||
def test_model_common_attributes(self):
|
||||
pass
|
||||
|
||||
# overwritten from parent as this equivalent test needs a specific `seed` and hard to get a good one!
|
||||
def check_pt_tf_outputs(self, tf_outputs, pt_outputs, model_class, tol=2e-5, name="outputs", attributes=None):
|
||||
super().check_pt_tf_outputs(tf_outputs, pt_outputs, model_class, tol=tol, name=name, attributes=attributes)
|
||||
|
||||
@is_pt_tf_cross_test
|
||||
def test_pt_tf_model_equivalence(self):
|
||||
import tensorflow as tf
|
||||
|
|
|
@ -71,7 +71,7 @@ class HubertModelTester:
|
|||
conv_bias=False,
|
||||
num_conv_pos_embeddings=16,
|
||||
num_conv_pos_embedding_groups=2,
|
||||
num_hidden_layers=4,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=2,
|
||||
hidden_dropout_prob=0.1, # this is most likely not correctly set yet
|
||||
intermediate_size=20,
|
||||
|
|
|
@ -62,7 +62,7 @@ class IBertModelTester:
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -65,7 +65,7 @@ class ImageGPTModelTester:
|
|||
use_mc_token_ids=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -64,7 +64,7 @@ class InstructBlipVisionModelTester:
|
|||
is_training=True,
|
||||
hidden_size=32,
|
||||
projection_dim=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
dropout=0.1,
|
||||
|
@ -219,7 +219,7 @@ class InstructBlipQFormerModelTester:
|
|||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
projection_dim=32,
|
||||
num_hidden_layers=6,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
dropout=0.1,
|
||||
|
@ -295,7 +295,7 @@ class InstructBlipTextModelDecoderOnlyTester:
|
|||
use_labels=False,
|
||||
vocab_size=99,
|
||||
hidden_size=16,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=4,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -48,7 +48,7 @@ class LayoutLMModelTester:
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -55,7 +55,7 @@ class LayoutLMv2ModelTester:
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=36,
|
||||
num_hidden_layers=3,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -63,7 +63,7 @@ class LayoutLMv3ModelTester:
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=36,
|
||||
num_hidden_layers=3,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -46,7 +46,7 @@ class LlamaModelTester:
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -50,7 +50,7 @@ class LongformerModelTester:
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -71,7 +71,7 @@ class FlaxLongT5ModelTester:
|
|||
use_attention_mask=True,
|
||||
use_labels=True,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
d_ff=37,
|
||||
relative_attention_num_buckets=8,
|
||||
|
|
|
@ -59,7 +59,7 @@ class LongT5ModelTester:
|
|||
use_attention_mask=True,
|
||||
use_labels=True,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
d_ff=37,
|
||||
relative_attention_num_buckets=8,
|
||||
|
@ -916,7 +916,7 @@ class LongT5EncoderOnlyModelTester:
|
|||
# For common tests
|
||||
use_attention_mask=True,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
d_ff=37,
|
||||
relative_attention_num_buckets=8,
|
||||
|
|
|
@ -61,7 +61,7 @@ class LukeModelTester:
|
|||
entity_vocab_size=10,
|
||||
entity_emb_size=6,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -661,7 +661,7 @@ class MarianStandaloneDecoderModelTester:
|
|||
use_labels=True,
|
||||
decoder_start_token_id=2,
|
||||
decoder_ffn_dim=32,
|
||||
decoder_layers=4,
|
||||
decoder_layers=2,
|
||||
encoder_attention_heads=4,
|
||||
decoder_attention_heads=4,
|
||||
max_position_embeddings=30,
|
||||
|
|
|
@ -53,7 +53,7 @@ class MarkupLMModelTester:
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -491,7 +491,7 @@ class MBartStandaloneDecoderModelTester:
|
|||
use_labels=True,
|
||||
decoder_start_token_id=2,
|
||||
decoder_ffn_dim=32,
|
||||
decoder_layers=4,
|
||||
decoder_layers=2,
|
||||
encoder_attention_heads=4,
|
||||
decoder_attention_heads=4,
|
||||
max_position_embeddings=30,
|
||||
|
|
|
@ -51,7 +51,7 @@ class MegaModelTester:
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
intermediate_size=37,
|
||||
hidden_dropout_prob=0.1,
|
||||
attention_probs_dropout_prob=0.1,
|
||||
|
|
|
@ -58,7 +58,7 @@ class MegatronBertModelTester:
|
|||
vocab_size=99,
|
||||
hidden_size=64,
|
||||
embedding_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -55,7 +55,7 @@ class MgpstrModelTester:
|
|||
num_bpe_labels=99,
|
||||
num_wordpiece_labels=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
mlp_ratio=4.0,
|
||||
patch_embeds_hidden_size=257,
|
||||
|
|
|
@ -54,7 +54,7 @@ class MobileBertModelTester:
|
|||
vocab_size=99,
|
||||
hidden_size=64,
|
||||
embedding_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -49,7 +49,7 @@ class MPNetModelTester:
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=64,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=64,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -54,7 +54,7 @@ class MptModelTester:
|
|||
use_mc_token_ids=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -51,7 +51,7 @@ class MraModelTester:
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=16,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=2,
|
||||
intermediate_size=36,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -595,7 +595,7 @@ class MvpStandaloneDecoderModelTester:
|
|||
use_labels=True,
|
||||
decoder_start_token_id=2,
|
||||
decoder_ffn_dim=32,
|
||||
decoder_layers=4,
|
||||
decoder_layers=2,
|
||||
encoder_attention_heads=4,
|
||||
decoder_attention_heads=4,
|
||||
max_position_embeddings=30,
|
||||
|
|
|
@ -55,7 +55,7 @@ class NezhaModelTester:
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -52,7 +52,7 @@ class NllbMoeModelTester:
|
|||
use_labels=False,
|
||||
vocab_size=99,
|
||||
hidden_size=16,
|
||||
num_hidden_layers=4,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=4,
|
||||
hidden_act="relu",
|
||||
|
|
|
@ -51,7 +51,7 @@ class NystromformerModelTester:
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -49,7 +49,7 @@ class OpenAIGPTModelTester:
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -70,7 +70,7 @@ class OPTModelTester:
|
|||
use_labels=False,
|
||||
vocab_size=99,
|
||||
hidden_size=16,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=4,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -62,7 +62,7 @@ class OwlViTVisionModelTester:
|
|||
num_channels=3,
|
||||
is_training=True,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
dropout=0.1,
|
||||
|
|
|
@ -52,7 +52,7 @@ class FlaxPegasusModelTester:
|
|||
use_labels=False,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_dropout_prob=0.1,
|
||||
|
|
|
@ -371,7 +371,7 @@ class PegasusStandaloneDecoderModelTester:
|
|||
use_labels=True,
|
||||
decoder_start_token_id=2,
|
||||
decoder_ffn_dim=32,
|
||||
decoder_layers=4,
|
||||
decoder_layers=2,
|
||||
encoder_attention_heads=4,
|
||||
decoder_attention_heads=4,
|
||||
max_position_embeddings=30,
|
||||
|
|
|
@ -670,7 +670,7 @@ class PegasusXStandaloneDecoderModelTester:
|
|||
use_labels=True,
|
||||
decoder_start_token_id=2,
|
||||
decoder_ffn_dim=32,
|
||||
decoder_layers=4,
|
||||
decoder_layers=2,
|
||||
encoder_attention_heads=4,
|
||||
decoder_attention_heads=4,
|
||||
max_position_embeddings=30,
|
||||
|
|
|
@ -71,7 +71,7 @@ class Pix2StructVisionModelTester:
|
|||
patch_embed_hidden_size=12,
|
||||
projection_dim=32,
|
||||
max_patches=64,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
dropout=0.1,
|
||||
|
@ -230,7 +230,7 @@ class Pix2StructTextModelTester:
|
|||
vocab_size=99,
|
||||
hidden_size=12,
|
||||
projection_dim=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
dropout=0.1,
|
||||
|
|
|
@ -473,7 +473,7 @@ class PLBartStandaloneDecoderModelTester:
|
|||
use_labels=True,
|
||||
decoder_start_token_id=2,
|
||||
decoder_ffn_dim=32,
|
||||
decoder_layers=4,
|
||||
decoder_layers=2,
|
||||
encoder_attention_heads=4,
|
||||
decoder_attention_heads=4,
|
||||
max_position_embeddings=30,
|
||||
|
|
|
@ -55,10 +55,10 @@ class ProphetNetModelTester:
|
|||
use_labels=True,
|
||||
decoder_start_token_id=0,
|
||||
encoder_ffn_dim=32,
|
||||
num_encoder_layers=4,
|
||||
num_encoder_layers=2,
|
||||
num_encoder_attention_heads=4,
|
||||
decoder_ffn_dim=32,
|
||||
num_decoder_layers=4,
|
||||
num_decoder_layers=2,
|
||||
num_decoder_attention_heads=4,
|
||||
max_position_embeddings=30,
|
||||
is_encoder_decoder=True,
|
||||
|
@ -437,10 +437,10 @@ class ProphetNetModelTester:
|
|||
decoder_attention_mask=decoder_attention_mask,
|
||||
labels=lm_labels,
|
||||
)
|
||||
self.parent.assertTrue(torch.allclose(result.loss, torch.tensor(4.5981, device=torch_device), atol=1e-3))
|
||||
self.parent.assertTrue(torch.allclose(result.loss, torch.tensor(4.5892, device=torch_device), atol=1e-3))
|
||||
|
||||
expected_logit_slice = torch.tensor(
|
||||
[-0.0648, 0.0790, 0.0360, 0.0089, 0.0039, -0.0639, 0.0131], device=torch_device
|
||||
[-0.0184, 0.0758, -0.0543, -0.0093, 0.0050, -0.0660, -0.1453], device=torch_device
|
||||
)
|
||||
self.parent.assertTrue(torch.allclose(result.logits[0, :, 1], expected_logit_slice, atol=1e-3))
|
||||
|
||||
|
@ -551,10 +551,10 @@ class ProphetNetStandaloneDecoderModelTester:
|
|||
use_labels=True,
|
||||
decoder_start_token_id=0,
|
||||
encoder_ffn_dim=32,
|
||||
num_encoder_layers=4,
|
||||
num_encoder_layers=2,
|
||||
num_encoder_attention_heads=4,
|
||||
decoder_ffn_dim=32,
|
||||
num_decoder_layers=4,
|
||||
num_decoder_layers=2,
|
||||
num_decoder_attention_heads=4,
|
||||
max_position_embeddings=30,
|
||||
is_encoder_decoder=False,
|
||||
|
@ -782,10 +782,10 @@ class ProphetNetStandaloneEncoderModelTester:
|
|||
use_labels=True,
|
||||
decoder_start_token_id=0,
|
||||
encoder_ffn_dim=32,
|
||||
num_encoder_layers=4,
|
||||
num_encoder_layers=2,
|
||||
num_encoder_attention_heads=4,
|
||||
decoder_ffn_dim=32,
|
||||
num_decoder_layers=4,
|
||||
num_decoder_layers=2,
|
||||
num_decoder_attention_heads=4,
|
||||
max_position_embeddings=30,
|
||||
is_encoder_decoder=False,
|
||||
|
|
|
@ -54,7 +54,7 @@ class QDQBertModelTester:
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -54,7 +54,7 @@ class RealmModelTester:
|
|||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
|
@ -55,7 +55,7 @@ class RemBertModelTester:
|
|||
hidden_size=32,
|
||||
input_embedding_size=18,
|
||||
output_embedding_size=43,
|
||||
num_hidden_layers=5,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue