Add layerdrop
parent: df27648bd9
commit: b43cb09aaa
@@ -51,6 +51,7 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
 10. `CamemBERT <https://huggingface.co/transformers/model_doc/camembert.html>`_ (from FAIR, Inria, Sorbonne Université) released together with the paper `CamemBERT: a Tasty French Language Model <https://arxiv.org/abs/1911.03894>`_ by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suarez, Yoann Dupont, Laurent Romary, Eric Villemonte de la Clergerie, Djame Seddah, and Benoît Sagot.
 11. `ALBERT <https://github.com/google-research/ALBERT>`_ (from Google Research), released together with the paper `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations <https://arxiv.org/abs/1909.11942>`_ by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
 12. `XLM-RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/xlmr>`_ (from Facebook AI), released together with the paper `Unsupervised Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`_ by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
+13. `FlauBERT <https://github.com/getalp/Flaubert>`_ (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model Pre-training for French <https://arxiv.org/abs/1912.05372>`_ by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.

 .. toctree::
     :maxdepth: 2
@@ -45,7 +45,10 @@ class FlaubertConfig(XLMConfig):
     Args:
         pre_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
             Whether to apply the layer normalization before or after the feed forward layer following the
-            attention in each layer.
+            attention in each layer (Vaswani et al., Tensor2Tensor for Neural Machine Translation. 2018)
+        layerdrop (:obj:`float`, `optional`, defaults to 0.0):
+            Probability to drop layers during training (Fan et al., Reducing Transformer Depth on Demand
+            with Structured Dropout. ICLR 2020)
         vocab_size (:obj:`int`, optional, defaults to 30145):
             Vocabulary size of the XLM model. Defines the different tokens that
             can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLMModel`.
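As context for the two options documented above, here is a minimal usage sketch. It assumes `layerdrop` and `pre_norm` are accepted as keyword arguments by `FlaubertConfig` and that `FlaubertConfig`/`FlaubertModel` are importable from the package root; the values are illustrative only.

from transformers import FlaubertConfig, FlaubertModel

config = FlaubertConfig(
    pre_norm=True,   # illustrative: apply LayerNorm before the feed-forward block instead of after it
    layerdrop=0.2,   # illustrative: skip each transformer layer with probability 0.2 during training
)
model = FlaubertModel(config)
model.train()        # layers are only dropped in training mode; eval() always runs the full stack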
@@ -16,6 +16,7 @@


 import logging
+import random

 import torch
 from torch.nn import functional as F
@@ -113,8 +114,8 @@ class FlaubertModel(XLMModel):

     def __init__(self, config):  # , dico, is_encoder, with_output):
         super(FlaubertModel, self).__init__(config)
-        self.layerdrop = 0.0 if not hasattr(config, "layerdrop") else config.layerdrop
-        self.pre_norm = False if not hasattr(config, "pre_norm") else config.pre_norm
+        self.layerdrop = getattr(config, "layerdrop", 0.0)
+        self.pre_norm = getattr(config, "pre_norm", False)

     @add_start_docstrings_to_callable(FLAUBERT_INPUTS_DOCSTRING)
     def forward(
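The two replaced lines above are a behavior-preserving refactor: `getattr` with a default reads the attribute when it exists and otherwise falls back, exactly as the `hasattr` ternary did. A small standalone check, using a plain namespace object as a stand-in for the config:

from types import SimpleNamespace

config = SimpleNamespace()  # no "layerdrop" attribute set yet
old_style = 0.0 if not hasattr(config, "layerdrop") else config.layerdrop
new_style = getattr(config, "layerdrop", 0.0)
assert old_style == new_style == 0.0

config.layerdrop = 0.1
assert getattr(config, "layerdrop", 0.0) == 0.1  # attribute present, default is ignored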
@@ -243,6 +244,11 @@ class FlaubertModel(XLMModel):
         hidden_states = ()
         attentions = ()
         for i in range(self.n_layers):
+            # LayerDrop
+            dropout_probability = random.uniform(0, 1)
+            if self.training and (dropout_probability < self.layerdrop):
+                continue
+
             if self.output_hidden_states:
                 hidden_states = hidden_states + (tensor,)

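The loop above implements LayerDrop (Fan et al., ICLR 2020): during training each transformer layer is skipped with probability `self.layerdrop`, while inference always runs the full stack. Below is a self-contained sketch of the same idea applied to a generic stack of PyTorch modules; it is not the FlauBERT forward pass itself, and the class and variable names are illustrative.

import random
import torch
from torch import nn

class LayerDropStack(nn.Module):
    """Applies a stack of layers, randomly skipping whole layers during training (LayerDrop)."""

    def __init__(self, layers, layerdrop=0.0):
        super().__init__()
        self.layers = nn.ModuleList(layers)
        self.layerdrop = layerdrop

    def forward(self, x):
        for layer in self.layers:
            # One draw per layer: skip the layer entirely when training and the draw falls below layerdrop.
            if self.training and random.uniform(0, 1) < self.layerdrop:
                continue
            x = layer(x)
        return x

# Illustrative usage: 12 identical feed-forward blocks with a 20% drop rate.
stack = LayerDropStack([nn.Linear(16, 16) for _ in range(12)], layerdrop=0.2)
out = stack(torch.randn(4, 16))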