Improve BERT-like models' performance with better self-attention (#9124)
* Improve BERT-like models' attention layers
* Apply style
* Put back error raising instead of assert
* Update template
* Fix copies
* Apply ValueError raising in MPNet
* Restore the copy check for the Intermediate layer in Longformer
* Update Longformer
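The heart of the change, repeated across BERT, ELECTRA, RoBERTa, MPNet, Longformer and the cookiecutter template below: the per-head Dense projections plus the transpose_for_scores reshape/transpose bookkeeping are replaced by tf.keras.layers.experimental.EinsumDense layers that project straight into a (batch, seq, heads, head_size) layout, after which the attention scores and the weighted values are each a single tf.einsum. EinsumDense first shipped, under the experimental namespace, in TensorFlow 2.3, which is what the tensorflow>=2.3 bump in setup.py below is for. A minimal sketch of the new pattern, with toy sizes invented here for illustration (not taken from any real config):

import tensorflow as tf

# Illustrative dimensions only.
batch, seq_len, hidden = 2, 8, 64
num_heads, head_size = 4, 16  # hidden == num_heads * head_size

def head_projection(name):
    # Projects (batch, seq, hidden) -> (batch, seq, heads, head_size)
    # in one op, replacing Dense + reshape + transpose.
    return tf.keras.layers.experimental.EinsumDense(
        equation="abc,cde->abde",
        output_shape=(None, num_heads, head_size),
        bias_axes="de",
        name=name,
    )

query, key, value = (head_projection(n) for n in ("query", "key", "value"))

hidden_states = tf.random.normal((batch, seq_len, hidden))
q = query(hidden_states)  # (2, 8, 4, 16)
k = key(hidden_states)
v = value(hidden_states)

# Scale q by 1/sqrt(head_size) up front, then one einsum for the scores:
# a=batch, b=query position, c=head, d=head_size, e=key position.
q = tf.multiply(q, tf.math.rsqrt(tf.cast(head_size, q.dtype)))
scores = tf.einsum("aecd,abcd->acbe", k, q)       # (batch, heads, seq_q, seq_k)
probs = tf.nn.softmax(scores, axis=-1)
context = tf.einsum("acbe,aecd->abcd", probs, v)  # (batch, seq_q, heads, head_size)
print(context.shape)  # (2, 8, 4, 16)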
parent 6b034309ca
commit 5a8a4eb187
setup.py
@@ -127,8 +127,8 @@ _deps = [
     "sphinx-rtd-theme==0.4.3",  # sphinx-rtd-theme==0.5.0 introduced big changes in the style.
     "sphinx==3.2.1",
     "starlette",
-    "tensorflow-cpu>=2.0",
-    "tensorflow>=2.0",
+    "tensorflow-cpu>=2.3",
+    "tensorflow>=2.3",
     "timeout-decorator",
     "tokenizers==0.9.4",
     "torch>=1.0",
src/transformers/dependency_versions_table.py
@@ -40,8 +40,8 @@ deps = {
     "sphinx-rtd-theme": "sphinx-rtd-theme==0.4.3",
     "sphinx": "sphinx==3.2.1",
     "starlette": "starlette",
-    "tensorflow-cpu": "tensorflow-cpu>=2.0",
-    "tensorflow": "tensorflow>=2.0",
+    "tensorflow-cpu": "tensorflow-cpu>=2.3",
+    "tensorflow": "tensorflow>=2.3",
     "timeout-decorator": "timeout-decorator",
     "tokenizers": "tokenizers==0.9.4",
     "torch": "torch>=1.0",
src/transformers/models/bert/modeling_tf_bert.py
@@ -246,52 +246,52 @@ class TFBertSelfAttention(tf.keras.layers.Layer):

         if config.hidden_size % config.num_attention_heads != 0:
             raise ValueError(
-                "The hidden size (%d) is not a multiple of the number of attention "
-                "heads (%d)" % (config.hidden_size, config.num_attention_heads)
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number "
+                f"of attention heads ({config.num_attention_heads})"
             )

         self.num_attention_heads = config.num_attention_heads
-        assert config.hidden_size % config.num_attention_heads == 0
         self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
         self.all_head_size = self.num_attention_heads * self.attention_head_size
-        self.query = tf.keras.layers.Dense(
-            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
+        self.query = tf.keras.layers.experimental.EinsumDense(
+            equation="abc,cde->abde",
+            output_shape=(None, config.num_attention_heads, self.attention_head_size),
+            bias_axes="de",
+            kernel_initializer=get_initializer(initializer_range=config.initializer_range),
+            name="query",
         )
-        self.key = tf.keras.layers.Dense(
-            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
+        self.key = tf.keras.layers.experimental.EinsumDense(
+            equation="abc,cde->abde",
+            output_shape=(None, config.num_attention_heads, self.attention_head_size),
+            bias_axes="de",
+            kernel_initializer=get_initializer(initializer_range=config.initializer_range),
+            name="key",
         )
-        self.value = tf.keras.layers.Dense(
-            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
+        self.value = tf.keras.layers.experimental.EinsumDense(
+            equation="abc,cde->abde",
+            output_shape=(None, config.num_attention_heads, self.attention_head_size),
+            bias_axes="de",
+            kernel_initializer=get_initializer(initializer_range=config.initializer_range),
+            name="value",
         )
-        self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob)
+        self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)

-    def transpose_for_scores(self, x, batch_size):
-        x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size))
-
-        return tf.transpose(x, perm=[0, 2, 1, 3])
-
-    def call(self, hidden_states, attention_mask, head_mask, output_attentions, training=False):
-        batch_size = shape_list(hidden_states)[0]
-        mixed_query_layer = self.query(hidden_states)
-        mixed_key_layer = self.key(hidden_states)
-        mixed_value_layer = self.value(hidden_states)
-        query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
-        key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
-        value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
-
-        # Take the dot product between "query" and "key" to get the raw attention scores.
-        attention_scores = tf.matmul(
-            query_layer, key_layer, transpose_b=True
-        )  # (batch size, num_heads, seq_len_q, seq_len_k)
-        dk = tf.cast(shape_list(key_layer)[-1], attention_scores.dtype)  # scale attention_scores
-        attention_scores = attention_scores / tf.math.sqrt(dk)
+    def call(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False, training=False):
+        query_layer = self.query(inputs=hidden_states)
+        key_layer = self.key(inputs=hidden_states)
+        value_layer = self.value(inputs=hidden_states)
+
+        # Take the dot product between "query" and "key" to get the raw
+        # attention scores.
+        dk = tf.cast(x=self.attention_head_size, dtype=query_layer.dtype)
+        query_layer = tf.multiply(x=query_layer, y=tf.math.rsqrt(x=dk))
+        attention_scores = tf.einsum("aecd,abcd->acbe", key_layer, query_layer)

         if attention_mask is not None:
             # Apply the attention mask is (precomputed for all layers in TFBertModel call() function)
             attention_scores = attention_scores + attention_mask

         # Normalize the attention scores to probabilities.
-        attention_probs = tf.nn.softmax(attention_scores, axis=-1)
+        attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1)

         # This is actually dropping out entire tokens to attend to, which might
         # seem a bit unusual, but is taken from the original Transformer paper.
@@ -299,14 +299,10 @@ class TFBertSelfAttention(tf.keras.layers.Layer):

         # Mask heads if we want to
         if head_mask is not None:
-            attention_probs = attention_probs * head_mask
+            attention_scores = attention_scores * head_mask

-        context_layer = tf.matmul(attention_probs, value_layer)
-        context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
-        context_layer = tf.reshape(
-            context_layer, (batch_size, -1, self.all_head_size)
-        )  # (batch_size, seq_len_q, all_head_size)
-        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+        attention_output = tf.einsum("acbe,aecd->abcd", attention_probs, value_layer)
+        outputs = (attention_output, attention_probs) if output_attentions else (attention_output,)

         return outputs
@@ -315,16 +311,29 @@ class TFBertSelfOutput(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)

-        self.dense = tf.keras.layers.Dense(
-            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        if config.hidden_size % config.num_attention_heads != 0:
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number "
+                f"of attention heads ({config.num_attention_heads})"
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+        self.dense = tf.keras.layers.experimental.EinsumDense(
+            equation="abcd,cde->abe",
+            output_shape=(None, self.all_head_size),
+            bias_axes="e",
+            kernel_initializer=get_initializer(initializer_range=config.initializer_range),
+            name="dense",
         )
         self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)

     def call(self, hidden_states, input_tensor, training=False):
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.dropout(hidden_states, training=training)
-        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        hidden_states = self.dense(inputs=hidden_states)
+        hidden_states = self.dropout(inputs=hidden_states, training=training)
+        hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)

         return hidden_states
@@ -353,18 +362,22 @@ class TFBertIntermediate(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)

-        self.dense = tf.keras.layers.Dense(
-            config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        self.dense = tf.keras.layers.experimental.EinsumDense(
+            equation="abc,cd->abd",
+            output_shape=(None, config.intermediate_size),
+            bias_axes="d",
+            kernel_initializer=get_initializer(initializer_range=config.initializer_range),
+            name="dense",
         )

         if isinstance(config.hidden_act, str):
-            self.intermediate_act_fn = get_tf_activation(config.hidden_act)
+            self.intermediate_act_fn = get_tf_activation(activation_string=config.hidden_act)
         else:
             self.intermediate_act_fn = config.hidden_act

     def call(self, hidden_states):
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.intermediate_act_fn(hidden_states)
+        hidden_states = self.dense(inputs=hidden_states)
+        hidden_states = self.intermediate_act_fn(inputs=hidden_states)

         return hidden_states
@@ -373,16 +386,20 @@ class TFBertOutput(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)

-        self.dense = tf.keras.layers.Dense(
-            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        self.dense = tf.keras.layers.experimental.EinsumDense(
+            equation="abc,cd->abd",
+            bias_axes="d",
+            output_shape=(None, config.hidden_size),
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="dense",
         )
         self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)

     def call(self, hidden_states, input_tensor, training=False):
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.dropout(hidden_states, training=training)
-        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        hidden_states = self.dense(inputs=hidden_states)
+        hidden_states = self.dropout(inputs=hidden_states, training=training)
+        hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)

         return hidden_states
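For anyone checking the new einsum equations against the old BERT code above: they are exact drop-ins for the transpose/matmul/reshape chain they replace; only the point where the 1/sqrt(dk) scaling happens moves (onto the query, before the product, via rsqrt, instead of onto the scores afterwards via a sqrt division). A quick numerical sanity check, with test shapes invented for this sketch:

import tensorflow as tf

b, s, h, d = 2, 5, 3, 4  # arbitrary test sizes
q = tf.random.normal((b, s, h, d))
k = tf.random.normal((b, s, h, d))
v = tf.random.normal((b, s, h, d))

# Old path: move heads in front, matmul, then scale the scores.
qt, kt, vt = (tf.transpose(t, perm=[0, 2, 1, 3]) for t in (q, k, v))
old_scores = tf.matmul(qt, kt, transpose_b=True) / tf.math.sqrt(tf.cast(d, tf.float32))

# New path: pre-scale q with rsqrt, one einsum, no transposes.
new_scores = tf.einsum("aecd,abcd->acbe", k, q * tf.math.rsqrt(tf.cast(d, q.dtype)))
print(tf.reduce_max(tf.abs(old_scores - new_scores)).numpy())  # ~1e-6

# Context: old matmul + transpose + reshape vs. the single einsum.
probs = tf.nn.softmax(new_scores, axis=-1)
old_ctx = tf.reshape(tf.transpose(tf.matmul(probs, vt), perm=[0, 2, 1, 3]), (b, s, h * d))
new_ctx = tf.reshape(tf.einsum("acbe,aecd->abcd", probs, v), (b, s, h * d))
print(tf.reduce_max(tf.abs(old_ctx - new_ctx)).numpy())  # ~1e-6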
src/transformers/models/electra/modeling_tf_electra.py
@@ -69,59 +69,59 @@ TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [
 ]


-# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Electra
 class TFElectraSelfAttention(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)

         if config.hidden_size % config.num_attention_heads != 0:
             raise ValueError(
-                "The hidden size (%d) is not a multiple of the number of attention "
-                "heads (%d)" % (config.hidden_size, config.num_attention_heads)
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number "
+                f"of attention heads ({config.num_attention_heads})"
             )

         self.num_attention_heads = config.num_attention_heads
-        assert config.hidden_size % config.num_attention_heads == 0
         self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
         self.all_head_size = self.num_attention_heads * self.attention_head_size
-        self.query = tf.keras.layers.Dense(
-            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
+        self.query = tf.keras.layers.experimental.EinsumDense(
+            equation="abc,cde->abde",
+            output_shape=(None, config.num_attention_heads, self.attention_head_size),
+            bias_axes="de",
+            kernel_initializer=get_initializer(initializer_range=config.initializer_range),
+            name="query",
         )
-        self.key = tf.keras.layers.Dense(
-            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
+        self.key = tf.keras.layers.experimental.EinsumDense(
+            equation="abc,cde->abde",
+            output_shape=(None, config.num_attention_heads, self.attention_head_size),
+            bias_axes="de",
+            kernel_initializer=get_initializer(initializer_range=config.initializer_range),
+            name="key",
         )
-        self.value = tf.keras.layers.Dense(
-            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
+        self.value = tf.keras.layers.experimental.EinsumDense(
+            equation="abc,cde->abde",
+            output_shape=(None, config.num_attention_heads, self.attention_head_size),
+            bias_axes="de",
+            kernel_initializer=get_initializer(initializer_range=config.initializer_range),
+            name="value",
         )
-        self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob)
+        self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)

-    def transpose_for_scores(self, x, batch_size):
-        x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size))
-
-        return tf.transpose(x, perm=[0, 2, 1, 3])
-
-    def call(self, hidden_states, attention_mask, head_mask, output_attentions, training=False):
-        batch_size = shape_list(hidden_states)[0]
-        mixed_query_layer = self.query(hidden_states)
-        mixed_key_layer = self.key(hidden_states)
-        mixed_value_layer = self.value(hidden_states)
-        query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
-        key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
-        value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
-
-        # Take the dot product between "query" and "key" to get the raw attention scores.
-        attention_scores = tf.matmul(
-            query_layer, key_layer, transpose_b=True
-        )  # (batch size, num_heads, seq_len_q, seq_len_k)
-        dk = tf.cast(shape_list(key_layer)[-1], attention_scores.dtype)  # scale attention_scores
-        attention_scores = attention_scores / tf.math.sqrt(dk)
+    def call(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False, training=False):
+        query_layer = self.query(inputs=hidden_states)
+        key_layer = self.key(inputs=hidden_states)
+        value_layer = self.value(inputs=hidden_states)
+
+        # Take the dot product between "query" and "key" to get the raw
+        # attention scores.
+        dk = tf.cast(x=self.attention_head_size, dtype=query_layer.dtype)
+        query_layer = tf.multiply(x=query_layer, y=tf.math.rsqrt(x=dk))
+        attention_scores = tf.einsum("aecd,abcd->acbe", key_layer, query_layer)

         if attention_mask is not None:
-            # Apply the attention mask is (precomputed for all layers in TFBertModel call() function)
+            # Apply the attention mask is (precomputed for all layers in TFElectraModel call() function)
             attention_scores = attention_scores + attention_mask

         # Normalize the attention scores to probabilities.
-        attention_probs = tf.nn.softmax(attention_scores, axis=-1)
+        attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1)

         # This is actually dropping out entire tokens to attend to, which might
         # seem a bit unusual, but is taken from the original Transformer paper.
@@ -129,33 +129,42 @@ class TFElectraSelfAttention(tf.keras.layers.Layer):

         # Mask heads if we want to
         if head_mask is not None:
-            attention_probs = attention_probs * head_mask
+            attention_scores = attention_scores * head_mask

-        context_layer = tf.matmul(attention_probs, value_layer)
-        context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
-        context_layer = tf.reshape(
-            context_layer, (batch_size, -1, self.all_head_size)
-        )  # (batch_size, seq_len_q, all_head_size)
-        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+        attention_output = tf.einsum("acbe,aecd->abcd", attention_probs, value_layer)
+        outputs = (attention_output, attention_probs) if output_attentions else (attention_output,)

         return outputs


-# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Electra
 class TFElectraSelfOutput(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)

-        self.dense = tf.keras.layers.Dense(
-            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        if config.hidden_size % config.num_attention_heads != 0:
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number "
+                f"of attention heads ({config.num_attention_heads})"
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+        self.dense = tf.keras.layers.experimental.EinsumDense(
+            equation="abcd,cde->abe",
+            output_shape=(None, self.all_head_size),
+            bias_axes="e",
+            kernel_initializer=get_initializer(initializer_range=config.initializer_range),
+            name="dense",
         )
         self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)

     def call(self, hidden_states, input_tensor, training=False):
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.dropout(hidden_states, training=training)
-        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        hidden_states = self.dense(inputs=hidden_states)
+        hidden_states = self.dropout(inputs=hidden_states, training=training)
+        hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)

         return hidden_states
@@ -186,18 +195,22 @@ class TFElectraIntermediate(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)

-        self.dense = tf.keras.layers.Dense(
-            config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        self.dense = tf.keras.layers.experimental.EinsumDense(
+            equation="abc,cd->abd",
+            output_shape=(None, config.intermediate_size),
+            bias_axes="d",
+            kernel_initializer=get_initializer(initializer_range=config.initializer_range),
+            name="dense",
         )

         if isinstance(config.hidden_act, str):
-            self.intermediate_act_fn = get_tf_activation(config.hidden_act)
+            self.intermediate_act_fn = get_tf_activation(activation_string=config.hidden_act)
         else:
             self.intermediate_act_fn = config.hidden_act

     def call(self, hidden_states):
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.intermediate_act_fn(hidden_states)
+        hidden_states = self.dense(inputs=hidden_states)
+        hidden_states = self.intermediate_act_fn(inputs=hidden_states)

         return hidden_states
@@ -207,16 +220,20 @@ class TFElectraOutput(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)

-        self.dense = tf.keras.layers.Dense(
-            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        self.dense = tf.keras.layers.experimental.EinsumDense(
+            equation="abc,cd->abd",
+            bias_axes="d",
+            output_shape=(None, config.hidden_size),
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="dense",
         )
         self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)

     def call(self, hidden_states, input_tensor, training=False):
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.dropout(hidden_states, training=training)
-        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        hidden_states = self.dense(inputs=hidden_states)
+        hidden_states = self.dropout(inputs=hidden_states, training=training)
+        hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)

         return hidden_states
src/transformers/models/longformer/modeling_tf_longformer.py
@@ -618,18 +618,22 @@ class TFLongformerIntermediate(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)

-        self.dense = tf.keras.layers.Dense(
-            config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        self.dense = tf.keras.layers.experimental.EinsumDense(
+            equation="abc,cd->abd",
+            output_shape=(None, config.intermediate_size),
+            bias_axes="d",
+            kernel_initializer=get_initializer(initializer_range=config.initializer_range),
+            name="dense",
         )

         if isinstance(config.hidden_act, str):
-            self.intermediate_act_fn = get_tf_activation(config.hidden_act)
+            self.intermediate_act_fn = get_tf_activation(activation_string=config.hidden_act)
         else:
             self.intermediate_act_fn = config.hidden_act

     def call(self, hidden_states):
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.intermediate_act_fn(hidden_states)
+        hidden_states = self.dense(inputs=hidden_states)
+        hidden_states = self.intermediate_act_fn(inputs=hidden_states)

         return hidden_states
@@ -639,16 +643,20 @@ class TFLongformerOutput(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)

-        self.dense = tf.keras.layers.Dense(
-            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        self.dense = tf.keras.layers.experimental.EinsumDense(
+            equation="abc,cd->abd",
+            bias_axes="d",
+            output_shape=(None, config.hidden_size),
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="dense",
         )
         self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)

     def call(self, hidden_states, input_tensor, training=False):
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.dropout(hidden_states, training=training)
-        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        hidden_states = self.dense(inputs=hidden_states)
+        hidden_states = self.dropout(inputs=hidden_states, training=training)
+        hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)

         return hidden_states
@@ -674,7 +682,6 @@ class TFLongformerPooler(tf.keras.layers.Layer):
         return pooled_output


-# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput
 class TFLongformerSelfOutput(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)
src/transformers/models/mpnet/modeling_tf_mpnet.py
@@ -239,54 +239,58 @@ class TFMPNetSelfAttention(tf.keras.layers.Layer):

         if config.hidden_size % config.num_attention_heads != 0:
             raise ValueError(
-                "The hidden size (%d) is not a multiple of the number of attention "
-                "heads (%d)" % (config.hidden_size, config.num_attention_heads)
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number "
+                f"of attention heads ({config.num_attention_heads})"
             )

         self.num_attention_heads = config.num_attention_heads
-        assert config.hidden_size % config.num_attention_heads == 0
         self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
         self.all_head_size = self.num_attention_heads * self.attention_head_size

-        self.q = tf.keras.layers.Dense(
-            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="q"
+        self.q = tf.keras.layers.experimental.EinsumDense(
+            equation="abc,cde->abde",
+            output_shape=(None, config.num_attention_heads, self.attention_head_size),
+            bias_axes="de",
+            kernel_initializer=get_initializer(initializer_range=config.initializer_range),
+            name="q",
         )
-        self.k = tf.keras.layers.Dense(
-            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="k"
+        self.k = tf.keras.layers.experimental.EinsumDense(
+            equation="abc,cde->abde",
+            output_shape=(None, config.num_attention_heads, self.attention_head_size),
+            bias_axes="de",
+            kernel_initializer=get_initializer(initializer_range=config.initializer_range),
+            name="k",
         )
-        self.v = tf.keras.layers.Dense(
-            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="v"
+        self.v = tf.keras.layers.experimental.EinsumDense(
+            equation="abc,cde->abde",
+            output_shape=(None, config.num_attention_heads, self.attention_head_size),
+            bias_axes="de",
+            kernel_initializer=get_initializer(initializer_range=config.initializer_range),
+            name="v",
         )
-        self.o = tf.keras.layers.Dense(
-            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="o"
+        self.o = tf.keras.layers.experimental.EinsumDense(
+            equation="abcd,cde->abe",
+            output_shape=(None, self.all_head_size),
+            bias_axes="e",
+            kernel_initializer=get_initializer(initializer_range=config.initializer_range),
+            name="o",
         )
         self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob)

-    def transpose_for_scores(self, x, batch_size):
-        x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size))
-
-        return tf.transpose(x, perm=[0, 2, 1, 3])
-
     def call(self, hidden_states, attention_mask, head_mask, output_attentions, position_bias=None, training=False):
-        batch_size = shape_list(hidden_states)[0]
-
         q = self.q(hidden_states)
         k = self.k(hidden_states)
         v = self.v(hidden_states)

-        q = self.transpose_for_scores(q, batch_size)
-        k = self.transpose_for_scores(k, batch_size)
-        v = self.transpose_for_scores(v, batch_size)
-
-        attention_scores = tf.matmul(q, k, transpose_b=True)
-        dk = tf.cast(shape_list(k)[-1], attention_scores.dtype)
-        attention_scores = attention_scores / tf.math.sqrt(dk)
+        dk = tf.cast(x=self.attention_head_size, dtype=q.dtype)
+        q = tf.multiply(x=q, y=tf.math.rsqrt(x=dk))
+        attention_scores = tf.einsum("aecd,abcd->acbe", k, q)

         # Apply relative position embedding (precomputed in MPNetEncoder) if provided.
         if position_bias is not None:
             attention_scores += position_bias

         if attention_mask is not None:
             # Apply the attention mask is (precomputed for all layers in TFMPNetModel call() function)
             attention_scores = attention_scores + attention_mask

         attention_probs = tf.nn.softmax(attention_scores, axis=-1)
@@ -296,9 +300,7 @@ class TFMPNetSelfAttention(tf.keras.layers.Layer):
         if head_mask is not None:
             attention_probs = attention_probs * head_mask

-        c = tf.matmul(attention_probs, v)
-        c = tf.transpose(c, perm=[0, 2, 1, 3])
-        c = tf.reshape(c, (batch_size, -1, self.all_head_size))
+        c = tf.einsum("acbe,aecd->abcd", attention_probs, v)
         o = self.o(c)

         outputs = (o, attention_probs) if output_attentions else (o,)
@@ -330,18 +332,22 @@ class TFMPNetIntermediate(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)

-        self.dense = tf.keras.layers.Dense(
-            config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        self.dense = tf.keras.layers.experimental.EinsumDense(
+            equation="abc,cd->abd",
+            output_shape=(None, config.intermediate_size),
+            bias_axes="d",
+            kernel_initializer=get_initializer(initializer_range=config.initializer_range),
+            name="dense",
         )

         if isinstance(config.hidden_act, str):
-            self.intermediate_act_fn = get_tf_activation(config.hidden_act)
+            self.intermediate_act_fn = get_tf_activation(activation_string=config.hidden_act)
         else:
             self.intermediate_act_fn = config.hidden_act

     def call(self, hidden_states):
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.intermediate_act_fn(hidden_states)
+        hidden_states = self.dense(inputs=hidden_states)
+        hidden_states = self.intermediate_act_fn(inputs=hidden_states)

         return hidden_states
@@ -351,16 +357,20 @@ class TFMPNetOutput(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)

-        self.dense = tf.keras.layers.Dense(
-            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        self.dense = tf.keras.layers.experimental.EinsumDense(
+            equation="abc,cd->abd",
+            bias_axes="d",
+            output_shape=(None, config.hidden_size),
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="dense",
         )
         self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)

     def call(self, hidden_states, input_tensor, training=False):
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.dropout(hidden_states, training=training)
-        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        hidden_states = self.dense(inputs=hidden_states)
+        hidden_states = self.dropout(inputs=hidden_states, training=training)
+        hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)

         return hidden_states
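The output-side projections follow the same idea in reverse. The rewritten SelfOutput layers (and MPNet's self.o above) consume the 4-D per-head context directly: equation "abcd,cde->abe" contracts the head and head_size axes back into the hidden dimension, fusing the old transpose + reshape + Dense into one op, and bias_axes="e" attaches a bias along the hidden axis only. That is also why the head-count bookkeeping (num_attention_heads, attention_head_size, and the divisibility check) now appears inside the SelfOutput layers. A small shape check, reusing the toy sizes from the sketch at the top:

import tensorflow as tf

batch, seq_len, num_heads, head_size = 2, 8, 4, 16
all_head_size = num_heads * head_size

# (batch, seq, heads, head_size) -> (batch, seq, all_head_size) in one einsum.
dense = tf.keras.layers.experimental.EinsumDense(
    equation="abcd,cde->abe",
    output_shape=(None, all_head_size),
    bias_axes="e",
)

context = tf.random.normal((batch, seq_len, num_heads, head_size))
print(dense(context).shape)  # (2, 8, 64)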
src/transformers/models/roberta/modeling_tf_roberta.py
@@ -243,59 +243,59 @@ class TFRobertaPooler(tf.keras.layers.Layer):
         return pooled_output


-# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Roberta
 class TFRobertaSelfAttention(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)

         if config.hidden_size % config.num_attention_heads != 0:
             raise ValueError(
-                "The hidden size (%d) is not a multiple of the number of attention "
-                "heads (%d)" % (config.hidden_size, config.num_attention_heads)
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number "
+                f"of attention heads ({config.num_attention_heads})"
             )

         self.num_attention_heads = config.num_attention_heads
-        assert config.hidden_size % config.num_attention_heads == 0
         self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
         self.all_head_size = self.num_attention_heads * self.attention_head_size
-        self.query = tf.keras.layers.Dense(
-            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
+        self.query = tf.keras.layers.experimental.EinsumDense(
+            equation="abc,cde->abde",
+            output_shape=(None, config.num_attention_heads, self.attention_head_size),
+            bias_axes="de",
+            kernel_initializer=get_initializer(initializer_range=config.initializer_range),
+            name="query",
         )
-        self.key = tf.keras.layers.Dense(
-            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
+        self.key = tf.keras.layers.experimental.EinsumDense(
+            equation="abc,cde->abde",
+            output_shape=(None, config.num_attention_heads, self.attention_head_size),
+            bias_axes="de",
+            kernel_initializer=get_initializer(initializer_range=config.initializer_range),
+            name="key",
         )
-        self.value = tf.keras.layers.Dense(
-            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
+        self.value = tf.keras.layers.experimental.EinsumDense(
+            equation="abc,cde->abde",
+            output_shape=(None, config.num_attention_heads, self.attention_head_size),
+            bias_axes="de",
+            kernel_initializer=get_initializer(initializer_range=config.initializer_range),
+            name="value",
         )
-        self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob)
+        self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)

-    def transpose_for_scores(self, x, batch_size):
-        x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size))
-
-        return tf.transpose(x, perm=[0, 2, 1, 3])
-
-    def call(self, hidden_states, attention_mask, head_mask, output_attentions, training=False):
-        batch_size = shape_list(hidden_states)[0]
-        mixed_query_layer = self.query(hidden_states)
-        mixed_key_layer = self.key(hidden_states)
-        mixed_value_layer = self.value(hidden_states)
-        query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
-        key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
-        value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
-
-        # Take the dot product between "query" and "key" to get the raw attention scores.
-        attention_scores = tf.matmul(
-            query_layer, key_layer, transpose_b=True
-        )  # (batch size, num_heads, seq_len_q, seq_len_k)
-        dk = tf.cast(shape_list(key_layer)[-1], attention_scores.dtype)  # scale attention_scores
-        attention_scores = attention_scores / tf.math.sqrt(dk)
+    def call(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False, training=False):
+        query_layer = self.query(inputs=hidden_states)
+        key_layer = self.key(inputs=hidden_states)
+        value_layer = self.value(inputs=hidden_states)
+
+        # Take the dot product between "query" and "key" to get the raw
+        # attention scores.
+        dk = tf.cast(x=self.attention_head_size, dtype=query_layer.dtype)
+        query_layer = tf.multiply(x=query_layer, y=tf.math.rsqrt(x=dk))
+        attention_scores = tf.einsum("aecd,abcd->acbe", key_layer, query_layer)

         if attention_mask is not None:
-            # Apply the attention mask is (precomputed for all layers in TFBertModel call() function)
+            # Apply the attention mask is (precomputed for all layers in TFRobertaModel call() function)
             attention_scores = attention_scores + attention_mask

         # Normalize the attention scores to probabilities.
-        attention_probs = tf.nn.softmax(attention_scores, axis=-1)
+        attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1)

         # This is actually dropping out entire tokens to attend to, which might
         # seem a bit unusual, but is taken from the original Transformer paper.
@@ -303,33 +303,42 @@ class TFRobertaSelfAttention(tf.keras.layers.Layer):

         # Mask heads if we want to
         if head_mask is not None:
-            attention_probs = attention_probs * head_mask
+            attention_scores = attention_scores * head_mask

-        context_layer = tf.matmul(attention_probs, value_layer)
-        context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
-        context_layer = tf.reshape(
-            context_layer, (batch_size, -1, self.all_head_size)
-        )  # (batch_size, seq_len_q, all_head_size)
-        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+        attention_output = tf.einsum("acbe,aecd->abcd", attention_probs, value_layer)
+        outputs = (attention_output, attention_probs) if output_attentions else (attention_output,)

         return outputs


-# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Roberta
 class TFRobertaSelfOutput(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)

-        self.dense = tf.keras.layers.Dense(
-            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        if config.hidden_size % config.num_attention_heads != 0:
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number "
+                f"of attention heads ({config.num_attention_heads})"
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+        self.dense = tf.keras.layers.experimental.EinsumDense(
+            equation="abcd,cde->abe",
+            output_shape=(None, self.all_head_size),
+            bias_axes="e",
+            kernel_initializer=get_initializer(initializer_range=config.initializer_range),
+            name="dense",
         )
         self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)

     def call(self, hidden_states, input_tensor, training=False):
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.dropout(hidden_states, training=training)
-        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        hidden_states = self.dense(inputs=hidden_states)
+        hidden_states = self.dropout(inputs=hidden_states, training=training)
+        hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)

         return hidden_states
@@ -360,18 +369,22 @@ class TFRobertaIntermediate(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)

-        self.dense = tf.keras.layers.Dense(
-            config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        self.dense = tf.keras.layers.experimental.EinsumDense(
+            equation="abc,cd->abd",
+            output_shape=(None, config.intermediate_size),
+            bias_axes="d",
+            kernel_initializer=get_initializer(initializer_range=config.initializer_range),
+            name="dense",
         )

         if isinstance(config.hidden_act, str):
-            self.intermediate_act_fn = get_tf_activation(config.hidden_act)
+            self.intermediate_act_fn = get_tf_activation(activation_string=config.hidden_act)
         else:
             self.intermediate_act_fn = config.hidden_act

     def call(self, hidden_states):
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.intermediate_act_fn(hidden_states)
+        hidden_states = self.dense(inputs=hidden_states)
+        hidden_states = self.intermediate_act_fn(inputs=hidden_states)

         return hidden_states
@@ -381,16 +394,20 @@ class TFRobertaOutput(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)

-        self.dense = tf.keras.layers.Dense(
-            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        self.dense = tf.keras.layers.experimental.EinsumDense(
+            equation="abc,cd->abd",
+            bias_axes="d",
+            output_shape=(None, config.hidden_size),
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="dense",
         )
         self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)

     def call(self, hidden_states, input_tensor, training=False):
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.dropout(hidden_states, training=training)
-        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        hidden_states = self.dense(inputs=hidden_states)
+        hidden_states = self.dropout(inputs=hidden_states, training=training)
+        hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)

         return hidden_states
templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py
@@ -191,52 +191,52 @@ class TF{{cookiecutter.camelcase_modelname}}SelfAttention(tf.keras.layers.Layer)

         if config.hidden_size % config.num_attention_heads != 0:
             raise ValueError(
-                "The hidden size (%d) is not a multiple of the number of attention "
-                "heads (%d)" % (config.hidden_size, config.num_attention_heads)
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number "
+                f"of attention heads ({config.num_attention_heads})"
             )

         self.num_attention_heads = config.num_attention_heads
-        assert config.hidden_size % config.num_attention_heads == 0
         self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
         self.all_head_size = self.num_attention_heads * self.attention_head_size
-        self.query = tf.keras.layers.Dense(
-            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
+        self.query = tf.keras.layers.experimental.EinsumDense(
+            equation="abc,cde->abde",
+            output_shape=(None, config.num_attention_heads, self.attention_head_size),
+            bias_axes="de",
+            kernel_initializer=get_initializer(initializer_range=config.initializer_range),
+            name="query",
         )
-        self.key = tf.keras.layers.Dense(
-            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
+        self.key = tf.keras.layers.experimental.EinsumDense(
+            equation="abc,cde->abde",
+            output_shape=(None, config.num_attention_heads, self.attention_head_size),
+            bias_axes="de",
+            kernel_initializer=get_initializer(initializer_range=config.initializer_range),
+            name="key",
         )
-        self.value = tf.keras.layers.Dense(
-            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
+        self.value = tf.keras.layers.experimental.EinsumDense(
+            equation="abc,cde->abde",
+            output_shape=(None, config.num_attention_heads, self.attention_head_size),
+            bias_axes="de",
+            kernel_initializer=get_initializer(initializer_range=config.initializer_range),
+            name="value",
         )
-        self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob)
+        self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)

-    def transpose_for_scores(self, x, batch_size):
-        x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size))
-
-        return tf.transpose(x, perm=[0, 2, 1, 3])
-
-    def call(self, hidden_states, attention_mask, head_mask, output_attentions, training=False):
-        batch_size = shape_list(hidden_states)[0]
-        mixed_query_layer = self.query(hidden_states)
-        mixed_key_layer = self.key(hidden_states)
-        mixed_value_layer = self.value(hidden_states)
-        query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
-        key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
-        value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
-
-        # Take the dot product between "query" and "key" to get the raw attention scores.
-        attention_scores = tf.matmul(
-            query_layer, key_layer, transpose_b=True
-        )  # (batch size, num_heads, seq_len_q, seq_len_k)
-        dk = tf.cast(shape_list(key_layer)[-1], attention_scores.dtype)  # scale attention_scores
-        attention_scores = attention_scores / tf.math.sqrt(dk)
+    def call(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False, training=False):
+        query_layer = self.query(inputs=hidden_states)
+        key_layer = self.key(inputs=hidden_states)
+        value_layer = self.value(inputs=hidden_states)
+
+        # Take the dot product between "query" and "key" to get the raw
+        # attention scores.
+        dk = tf.cast(x=self.attention_head_size, dtype=query_layer.dtype)
+        query_layer = tf.multiply(x=query_layer, y=tf.math.rsqrt(x=dk))
+        attention_scores = tf.einsum("aecd,abcd->acbe", key_layer, query_layer)

         if attention_mask is not None:
             # Apply the attention mask is (precomputed for all layers in TF{{cookiecutter.camelcase_modelname}}Model call() function)
             attention_scores = attention_scores + attention_mask

         # Normalize the attention scores to probabilities.
-        attention_probs = tf.nn.softmax(attention_scores, axis=-1)
+        attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1)

         # This is actually dropping out entire tokens to attend to, which might
         # seem a bit unusual, but is taken from the original Transformer paper.

@@ -244,14 +244,10 @@ class TF{{cookiecutter.camelcase_modelname}}SelfAttention(tf.keras.layers.Layer)

         # Mask heads if we want to
         if head_mask is not None:
-            attention_probs = attention_probs * head_mask
+            attention_scores = attention_scores * head_mask

-        context_layer = tf.matmul(attention_probs, value_layer)
-        context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
-        context_layer = tf.reshape(
-            context_layer, (batch_size, -1, self.all_head_size)
-        )  # (batch_size, seq_len_q, all_head_size)
-        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+        attention_output = tf.einsum("acbe,aecd->abcd", attention_probs, value_layer)
+        outputs = (attention_output, attention_probs) if output_attentions else (attention_output,)

         return outputs

@@ -261,16 +257,29 @@ class TF{{cookiecutter.camelcase_modelname}}SelfOutput(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)

-        self.dense = tf.keras.layers.Dense(
-            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        if config.hidden_size % config.num_attention_heads != 0:
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number "
+                f"of attention heads ({config.num_attention_heads})"
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+        self.dense = tf.keras.layers.experimental.EinsumDense(
+            equation="abcd,cde->abe",
+            output_shape=(None, self.all_head_size),
+            bias_axes="e",
+            kernel_initializer=get_initializer(initializer_range=config.initializer_range),
+            name="dense",
         )
         self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)

     def call(self, hidden_states, input_tensor, training=False):
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.dropout(hidden_states, training=training)
-        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        hidden_states = self.dense(inputs=hidden_states)
+        hidden_states = self.dropout(inputs=hidden_states, training=training)
+        hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)

         return hidden_states