Add layerdrop

Hang Le 2020-01-30 11:40:18 +01:00 committed by Lysandre Debut
parent df27648bd9
commit b43cb09aaa
3 changed files with 13 additions and 3 deletions


@@ -51,6 +51,7 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
10. `CamemBERT <https://huggingface.co/transformers/model_doc/camembert.html>`_ (from FAIR, Inria, Sorbonne Université) released together with the paper `CamemBERT: a Tasty French Language Model <https://arxiv.org/abs/1911.03894>`_ by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suarez, Yoann Dupont, Laurent Romary, Eric Villemonte de la Clergerie, Djame Seddah, and Benoît Sagot.
11. `ALBERT <https://github.com/google-research/ALBERT>`_ (from Google Research), released together with the paper `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations <https://arxiv.org/abs/1909.11942>`_ by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
12. `XLM-RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/xlmr>`_ (from Facebook AI), released together with the paper `Unsupervised Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`_ by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
13. `FlauBERT <https://github.com/getalp/Flaubert>`_ (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model Pre-training for French <https://arxiv.org/abs/1912.05372>`_ by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
.. toctree::
:maxdepth: 2


@@ -45,7 +45,10 @@ class FlaubertConfig(XLMConfig):
Args:
pre_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to apply the layer normalization before or after the feed forward layer following the
attention in each layer.
attention in each layer (Vaswani et al., Tensor2Tensor for Neural Machine Translation. 2018)
layerdrop (:obj:`float`, `optional`, defaults to 0.0):
Probability to drop layers during training (Fan et al., Reducing Transformer Depth on Demand
with Structured Dropout. ICLR 2020)
vocab_size (:obj:`int`, optional, defaults to 30145):
Vocabulary size of the XLM model. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLMModel`.
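As a rough usage sketch (not part of this diff), the new option would be passed through the configuration like any other attribute, assuming `FlaubertConfig` exposes `layerdrop` and `pre_norm` as constructor arguments as documented above; the value 0.2 is purely illustrative and the documented default stays at 0.0:

from transformers import FlaubertConfig, FlaubertModel

# Illustrative values; layerdrop=0.0 means no layers are ever skipped
config = FlaubertConfig(layerdrop=0.2, pre_norm=True)
model = FlaubertModel(config)
model.train()  # layers are only skipped in training mode, never at evaluation time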


@@ -16,6 +16,7 @@
import logging
import random
import torch
from torch.nn import functional as F
@@ -113,8 +114,8 @@ class FlaubertModel(XLMModel):
def __init__(self, config): # , dico, is_encoder, with_output):
super(FlaubertModel, self).__init__(config)
self.layerdrop = 0.0 if not hasattr(config, "layerdrop") else config.layerdrop
self.pre_norm = False if not hasattr(config, "pre_norm") else config.pre_norm
self.layerdrop = getattr(config, "layerdrop", 0.0)
self.pre_norm = getattr(config, "pre_norm", False)
@add_start_docstrings_to_callable(FLAUBERT_INPUTS_DOCSTRING)
def forward(
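The move from the `hasattr` ternaries to `getattr` with a default is behavior-preserving; the point of the default is that configurations saved before these fields existed still load cleanly. A minimal standalone sketch (the `OldConfig` class is hypothetical, purely for illustration):

class OldConfig:
    """Hypothetical config object serialized before `layerdrop` existed."""
    pre_norm = True  # no `layerdrop` attribute at all

config = OldConfig()
layerdrop = getattr(config, "layerdrop", 0.0)  # falls back to 0.0 instead of raising AttributeError
pre_norm = getattr(config, "pre_norm", False)  # reads the stored value, True
print(layerdrop, pre_norm)  # 0.0 True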
@@ -243,6 +244,11 @@ class FlaubertModel(XLMModel):
hidden_states = ()
attentions = ()
for i in range(self.n_layers):
# LayerDrop
dropout_probability = random.uniform(0, 1)
if self.training and (dropout_probability < self.layerdrop):
continue
if self.output_hidden_states:
hidden_states = hidden_states + (tensor,)