diff --git a/docs/source/model_doc/squeezebert.rst b/docs/source/model_doc/squeezebert.rst index c5c986ca7c..fc2bd18506 100644 --- a/docs/source/model_doc/squeezebert.rst +++ b/docs/source/model_doc/squeezebert.rst @@ -1,8 +1,8 @@ SqueezeBERT ----------------------------------------------------- +----------------------------------------------------------------------------------------------------------------------- Overview -~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The SqueezeBERT model was proposed in `SqueezeBERT: What can computer vision teach NLP about efficient neural networks? @@ -40,14 +40,14 @@ Tips: `squeezebert/squeezebert-mnli-headless` checkpoint. SqueezeBertConfig -~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.SqueezeBertConfig :members: SqueezeBertTokenizer -~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.SqueezeBertTokenizer :members: build_inputs_with_special_tokens, get_special_tokens_mask, @@ -55,49 +55,49 @@ SqueezeBertTokenizer SqueezeBertTokenizerFast -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.SqueezeBertTokenizerFast :members: SqueezeBertModel -~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.SqueezeBertModel :members: SqueezeBertForMaskedLM -~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.SqueezeBertForMaskedLM :members: SqueezeBertForSequenceClassification -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.SqueezeBertForSequenceClassification :members: SqueezeBertForMultipleChoice -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.SqueezeBertForMultipleChoice :members: SqueezeBertForTokenClassification -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.SqueezeBertForTokenClassification :members: SqueezeBertForQuestionAnswering -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.SqueezeBertForQuestionAnswering :members: diff --git a/src/transformers/configuration_squeezebert.py b/src/transformers/configuration_squeezebert.py index 361eb8dcdc..e83adba353 100644 --- a/src/transformers/configuration_squeezebert.py +++ b/src/transformers/configuration_squeezebert.py @@ -39,52 +39,51 @@ class SqueezeBertConfig(PretrainedConfig): Args: - vocab_size (:obj:`int`, optional, defaults to 30522): - Vocabulary size of the SqueezeBERT model. 
Defines the different tokens that
-            can be represented by the :obj:`inputs_ids` passed to the forward method of :class:`~transformers.SqueezeBertModel`.
-        hidden_size (:obj:`int`, optional, defaults to 512):
+        vocab_size (:obj:`int`, `optional`, defaults to 30522):
+            Vocabulary size of the SqueezeBERT model. Defines the number of different tokens that can be
+            represented by the :obj:`input_ids` passed when calling :class:`~transformers.SqueezeBertModel`.
+        hidden_size (:obj:`int`, `optional`, defaults to 768):
             Dimensionality of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, optional, defaults to 24):
+        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, optional, defaults to 4):
+        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, optional, defaults to 512):
-            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`function`, optional, defaults to :obj:`"gelu"`):
+        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+        hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler.
-            If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"`, :obj:`"gelu_new"`, :obj:`"gelu_fast"`,
-            and :obj:`"mish"` are supported.
-        hidden_dropout_prob (:obj:`float`, optional, defaults to 0.0):
-            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+            If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        max_position_embeddings (:obj:`int`, optional, defaults to 512):
+        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
             The maximum sequence length that this model might ever be used with.
             Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
-        type_vocab_size (:obj:`int`, optional, defaults to 2):
-            The vocabulary size of the :obj:`token_type_ids` passed into :class:`~transformers.SqueezeBertModel`.
-        initializer_range (:obj:`float`, optional, defaults to 0.02):
+        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
+            The vocabulary size of the :obj:`token_type_ids` passed when calling
+            :class:`~transformers.SqueezeBertModel`.
+        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
-            The epsilon used by the layer normalization layers.
+        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
-        pad_token_id (:obj:`int`, optional, defaults to 0):
+        pad_token_id (:obj:`int`, `optional`, defaults to 0):
             The ID of the token in the word embedding to use as padding.
-        embedding_size (:obj:`int`, optional, defaults to 128):
+        embedding_size (:obj:`int`, `optional`, defaults to 768):
             The dimension of the word embedding vectors.
-        q_groups (:obj:`int`, optional, defaults to 4):
-            The number of groups in Q layer
-        k_groups (:obj:`int`, optional, defaults to 4):
-            The number of groups in K layer
-        v_groups (:obj:`int`, optional, defaults to 4):
-            The number of groups in V layer
-        post_attention_groups (:obj:`int`, optional, defaults to 1):
-            The number of groups in the first feed forward network layer
-        intermediate_groups (:obj:`int`, optional, defaults to 4):
-            The number of groups in the second feed forward network layer
-        output_groups (:obj:`int`, optional, defaults to 4):
-            The number of groups in the third feed forward network layer
+        q_groups (:obj:`int`, `optional`, defaults to 4):
+            The number of groups in Q layer.
+        k_groups (:obj:`int`, `optional`, defaults to 4):
+            The number of groups in K layer.
+        v_groups (:obj:`int`, `optional`, defaults to 4):
+            The number of groups in V layer.
+        post_attention_groups (:obj:`int`, `optional`, defaults to 1):
+            The number of groups in the first feed forward network layer.
+        intermediate_groups (:obj:`int`, `optional`, defaults to 4):
+            The number of groups in the second feed forward network layer.
+        output_groups (:obj:`int`, `optional`, defaults to 4):
+            The number of groups in the third feed forward network layer.

     Example:

diff --git a/src/transformers/modeling_squeezebert.py b/src/transformers/modeling_squeezebert.py
index 8db6941504..d7e755ed25 100644
--- a/src/transformers/modeling_squeezebert.py
+++ b/src/transformers/modeling_squeezebert.py
@@ -42,10 +42,6 @@ logger = logging.get_logger(__name__)
 _CONFIG_FOR_DOC = "SqueezeBertConfig"
 _TOKENIZER_FOR_DOC = "SqueezeBertTokenizer"

-####################################################
-# This list contrains shortcut names for some of
-# the pretrained weights provided with the models
-####################################################
 SQUEEZEBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "squeezebert/squeezebert-uncased",
     "squeezebert/squeezebert-mnli",
@@ -407,11 +403,17 @@ class SqueezeBertPreTrainedModel(PreTrainedModel):
             module.bias.data.zero_()


-SQUEEZEBERT_START_DOCSTRING = r""" The SqueezeBERT model was proposed in
+SQUEEZEBERT_START_DOCSTRING = r"""
+
+    The SqueezeBERT model was proposed in
     `SqueezeBERT: What can computer vision teach NLP about efficient neural networks?
     <https://arxiv.org/abs/2006.11316>`__
     by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer

-    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
+    This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+    methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
+    pruning heads etc.)
+
+    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ subclass.
     Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
     and behavior.

@@ -445,47 +447,55 @@ SQUEEZEBERT_START_DOCSTRING = r""" The SqueezeBERT model was proposed in
 """

 SQUEEZEBERT_INPUTS_DOCSTRING = r"""
-    Inputs:
-        input_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`):
+    Args:
+        input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
             Indices of input sequence tokens in the vocabulary.

-            Indices can be obtained using :class:`transformers.SqueezeBertTokenizer`.
-            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.__call__` for details.
+            Indices can be obtained using :class:`~transformers.SqueezeBertTokenizer`.
+            See :meth:`transformers.PreTrainedTokenizer.encode` and
+            :meth:`transformers.PreTrainedTokenizer.__call__` for details.

             `What are input IDs? <../glossary.html#input-ids>`__
-        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
+        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
             Mask to avoid performing attention on padding token indices.
             Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.

             `What are attention masks? <../glossary.html#attention-mask>`__
-        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
+        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
             Segment token indices to indicate first and second portions of the inputs.
-            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
-            corresponds to a `sentence B` token
+            Indices are selected in ``[0, 1]``:
+
+            - 0 corresponds to a `sentence A` token,
+            - 1 corresponds to a `sentence B` token.

             `What are token type IDs? <../glossary.html#token-type-ids>`_
-        position_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
+        position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
             Indices of positions of each input sequence tokens in the position embeddings.
             Selected in the range ``[0, config.max_position_embeddings - 1]``.

             `What are position IDs? <../glossary.html#position-ids>`_
-        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
+        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
             Mask to nullify selected heads of the self-attention modules.
             Mask values selected in ``[0, 1]``:
-            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
-        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
             Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-            than the model's internal embedding lookup matrix.
-        output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
-            If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
-        output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
-            If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
-        return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
-            If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
-            plain tuple.
+            This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+            vectors than the model's internal embedding lookup matrix.
+        output_attentions (:obj:`bool`, `optional`):
+            Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+            tensors for more detail.
+        output_hidden_states (:obj:`bool`, `optional`):
+            Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+            more detail.
+        return_dict (:obj:`bool`, `optional`):
+            Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
 """
@@ -625,7 +635,7 @@ class SqueezeBertForMaskedLM(SqueezeBertPreTrainedModel):
         return_dict=None,
     ):
         r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
             Labels for computing the masked language modeling loss.
             Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
             Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
@@ -702,7 +712,7 @@ class SqueezeBertForSequenceClassification(SqueezeBertPreTrainedModel):
         return_dict=None,
     ):
         r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
             Labels for computing the sequence classification/regression loss.
             Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
             If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
@@ -787,7 +797,7 @@ class SqueezeBertForMultipleChoice(SqueezeBertPreTrainedModel):
         return_dict=None,
     ):
         r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
             Labels for computing the multiple choice classification loss.
             Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension
             of the input tensors. (see `input_ids` above)
@@ -877,7 +887,7 @@ class SqueezeBertForTokenClassification(SqueezeBertPreTrainedModel):
         return_dict=None,
     ):
         r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
             Labels for computing the token classification loss.
             Indices should be in ``[0, ..., config.num_labels - 1]``.
         """
@@ -927,8 +937,9 @@ class SqueezeBertForTokenClassification(SqueezeBertPreTrainedModel):


 @add_start_docstrings(
-    """SqueezeBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
-    layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """,
+    """SqueezeBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD
+    (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
+ """, SQUEEZEBERT_START_DOCSTRING, ) class SqueezeBertForQuestionAnswering(SqueezeBertPreTrainedModel): @@ -963,11 +974,11 @@ class SqueezeBertForQuestionAnswering(SqueezeBertPreTrainedModel): return_dict=None, ): r""" - start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. - end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. diff --git a/src/transformers/tokenization_squeezebert.py b/src/transformers/tokenization_squeezebert.py index c761d8a2c3..dc34122614 100644 --- a/src/transformers/tokenization_squeezebert.py +++ b/src/transformers/tokenization_squeezebert.py @@ -65,8 +65,8 @@ class SqueezeBertTokenizerFast(BertTokenizerFast): r""" Constructs a "Fast" SqueezeBert tokenizer (backed by HuggingFace's `tokenizers` library). - :class:`~transformers.SqueezeBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs end-to-end - tokenization: punctuation splitting + wordpiece. + :class:`~transformers.SqueezeBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and + runs end-to-end tokenization: punctuation splitting + wordpiece. Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning parameters.