fix some typos in docs, comments, logging/errors (#11432)
parent e3e70f9551
commit b24ead87e1
@@ -57,14 +57,14 @@ class AddNewModelCommand(BaseTransformersCLICommand):
if not _has_cookiecutter:
raise ImportError(
"Model creation dependencies are required to use the `add_new_model` command. Install them by running "
- "the folowing at the root of your `transformers` clone:\n\n\t$ pip install -e .[modelcreation]\n"
+ "the following at the root of your `transformers` clone:\n\n\t$ pip install -e .[modelcreation]\n"
)
# Ensure that there is no other `cookiecutter-template-xxx` directory in the current working directory
directories = [directory for directory in os.listdir() if "cookiecutter-template-" == directory[:22]]
if len(directories) > 0:
raise ValueError(
"Several directories starting with `cookiecutter-template-` in current working directory. "
- "Please clean your directory by removing all folders startign with `cookiecutter-template-` or "
+ "Please clean your directory by removing all folders starting with `cookiecutter-template-` or "
"change your working directory."
)
@@ -244,7 +244,7 @@ def squad_convert_example_to_features(
cls_index = span["input_ids"].index(tokenizer.cls_token_id)

# p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer)
- # Original TF implem also keep the classification token (set to 0)
+ # Original TF implementation also keep the classification token (set to 0)
p_mask = np.ones_like(span["token_type_ids"])
if tokenizer.padding_side == "right":
p_mask[len(truncated_query) + sequence_added_tokens :] = 0
@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
- Sequence feature extraction class for common feature extrcactors to preprocess sequences.
+ Sequence feature extraction class for common feature extractors to preprocess sequences.
"""
from typing import Dict, List, Optional, Union
@@ -551,7 +551,7 @@ BACKENDS_MAPPING = OrderedDict(
("sklearn", (is_sklearn_available, SKLEARN_IMPORT_ERROR)),
("speech", (is_speech_available, SPEECH_IMPORT_ERROR)),
("tf", (is_tf_available, TENSORFLOW_IMPORT_ERROR)),
- ("tokenziers", (is_tokenizers_available, TOKENIZERS_IMPORT_ERROR)),
+ ("tokenizers", (is_tokenizers_available, TOKENIZERS_IMPORT_ERROR)),
("torch", (is_torch_available, PYTORCH_IMPORT_ERROR)),
("vision", (is_vision_available, VISION_IMPORT_ERROR)),
]
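Not part of the commit, but context for the hunk above: each entry in BACKENDS_MAPPING pairs a backend name with an availability probe and an import-error message. A minimal sketch of how such a mapping is typically consumed (the helper name is hypothetical, not the library's actual API):

    def require_backend(name: str):
        # Look up the availability check and the canned error message for this backend
        is_available, error_msg = BACKENDS_MAPPING[name]
        if not is_available():
            raise ImportError(error_msg)

    require_backend("tokenizers")  # raises ImportError with TOKENIZERS_IMPORT_ERROR if the package is missing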
@@ -446,7 +446,7 @@ class NoBadWordsLogitsProcessor(LogitsProcessor):

class PrefixConstrainedLogitsProcessor(LogitsProcessor):
r"""
- :class:`transformers.LogitsProcessor` that enforces contrained generation and is useful for prefix-conditioned
+ :class:`transformers.LogitsProcessor` that enforces constrained generation and is useful for prefix-conditioned
constrained generation. See `Autoregressive Entity Retrieval <https://arxiv.org/abs/2010.00904>`__ for more
information.
@@ -23,7 +23,7 @@ STOPPING_CRITERIA_INPUTS_DOCSTRING = r"""
Prediction scores of a language modeling head. These can be scores for each vocabulary token before SoftMax
or scores for each vocabulary token after SoftMax.
kwargs:
- Additional stopping critera specific kwargs.
+ Additional stopping criteria specific kwargs.

Return:
:obj:`bool`. :obj:`False` indicates we should continue, :obj:`True` indicates we should stop.
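For context on the documented contract above (not part of this diff): a stopping criterion is a callable that receives input_ids and scores and returns True to stop generation. A hedged sketch of a length-based criterion:

    class MaxLengthCriterion:
        """Toy criterion: stop once the generated sequence reaches max_length tokens."""

        def __init__(self, max_length: int):
            self.max_length = max_length

        def __call__(self, input_ids, scores, **kwargs) -> bool:
            # True means "stop", False means "keep generating", matching the docstring above
            return input_ids.shape[-1] >= self.max_length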
@@ -442,8 +442,8 @@ class TFGenerationMixin:
**kwargs
):
"""
- Generate sequences for each example without beam search (num_beams == 1). All returned sequence are generated
- independantly.
+ Generate sequences for each example without beam search (num_beams == 1). All returned sequences are generated
+ independently.
"""

# length of generated sentences / unfinished sentences
@@ -821,7 +821,7 @@ class GenerationMixin:
... "at least two people were killed in a suspected bomb attack on a passenger bus "
... "in the strife-torn southern philippines on monday , the military said."
... )
- >>> # encode input contex
+ >>> # encode input context
>>> input_ids = tokenizer(document, return_tensors="pt").input_ids
>>> # generate 3 independent sequences using beam search decoding (5 beams)
>>> # with T5 encoder-decoder model conditioned on short news article.
@@ -94,7 +94,7 @@ class FlaxPreTrainedModel(PushToHubMixin):
self.key = PRNGKey(seed)
self.dtype = dtype

- # randomely initialized parameters
+ # randomly initialized parameters
random_params = self.init_weights(self.key, input_shape)

# save required_params as set
@@ -343,7 +343,7 @@ class CausalLMOutputWithPast(ModelOutput):
Language modeling loss (for next-token prediction).
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- past_key_values (:obj:`tuple(tupel(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
+ past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors
of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`)

@@ -423,7 +423,7 @@ class SequenceClassifierOutputWithPast(ModelOutput):
Classification (or regression if config.num_labels==1) loss.
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
- past_key_values (:obj:`tuple(tupel(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
+ past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors
of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
@@ -51,7 +51,7 @@ def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove="")
) # '_._' is replaced by a level separation (can be used to convert TF2.0 lists in PyTorch nn.ModulesList)
tf_name = re.sub(r"//+", "/", tf_name) # Remove empty levels at the end
tf_name = tf_name.split("/") # Convert from TF2.0 '/' separators to PyTorch '.' separators
- # Some weights have a single name withtout "/" such as final_logits_bias in BART
+ # Some weights have a single name without "/" such as final_logits_bias in BART
if len(tf_name) > 1:
tf_name = tf_name[1:] # Remove level zero
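As context for the hunk above (a hedged sketch under simplifying assumptions, not the library's full implementation): the conversion strips the TF variable suffix, removes empty levels, drops the level-zero model scope, and joins the rest with dots.

    import re

    def tf_to_pt_name_sketch(tf_name: str) -> str:
        # Hedged simplification: the real conversion also handles '_._' list levels and suffix renames
        tf_name = tf_name.replace(":0", "")       # drop the TF variable suffix
        tf_name = re.sub(r"//+", "/", tf_name)    # remove empty levels
        parts = tf_name.split("/")
        if len(parts) > 1:
            parts = parts[1:]                     # remove level zero (the outer model scope)
        return ".".join(parts)

    print(tf_to_pt_name_sketch("tf_bert/bert/embeddings/word_embeddings/weight:0"))
    # bert.embeddings.word_embeddings.weight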
@@ -659,7 +659,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu

Args:
inputs (:obj:`Dict[str, tf.Tensor]`):
- The input of the saved model as a dictionnary of tensors.
+ The input of the saved model as a dictionary of tensors.
"""
output = self.call(inputs)

@@ -944,7 +944,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
vectors from the end. If not provided or :obj:`None`, just returns None

Return:
- :obj:`tf.Variable`: Pointer to the resized decoder or None if the output embeddings are differents of the
+ :obj:`tf.Variable`: Pointer to the resized decoder or None if the output embeddings are different from the
input ones.
"""
new_lm_head_decoder = old_lm_head_decoder
@@ -291,7 +291,7 @@ class ModuleUtilsMixin:
The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard).
num_hidden_layers (:obj:`int`):
The number of hidden layers in the model.
- is_attention_chunked: (:obj:`bool`, `optional, defaults to :obj:`False`):
+ is_attention_chunked: (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the attentions scores are computed by chunks or not.

Returns:
@@ -716,7 +716,7 @@ AutoModelForPreTraining = auto_class_factory(
"AutoModelForPreTraining", MODEL_FOR_PRETRAINING_MAPPING, head_doc="pretraining"
)

- # Private on puprose, the public class will add the deprecation warnings.
+ # Private on purpose, the public class will add the deprecation warnings.
_AutoModelWithLMHead = auto_class_factory(
"AutoModelWithLMHead", MODEL_WITH_LM_HEAD_MAPPING, head_doc="language modeling"
)
@@ -103,7 +103,7 @@ FlaxAutoModelForMaskedLM = auto_class_factory(
)

FlaxAutoModelForSequenceClassification = auto_class_factory(
- "AFlaxutoModelForSequenceClassification",
+ "FlaxAutoModelForSequenceClassification",
FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
head_doc="sequence classification",
)
@@ -469,7 +469,7 @@ TFAutoModelForPreTraining = auto_class_factory(
"TFAutoModelForPreTraining", TF_MODEL_FOR_PRETRAINING_MAPPING, head_doc="pretraining"
)

- # Private on puprose, the public class will add the deprecation warnings.
+ # Private on purpose, the public class will add the deprecation warnings.
_TFAutoModelWithLMHead = auto_class_factory(
"TFAutoModelWithLMHead", TF_MODEL_WITH_LM_HEAD_MAPPING, head_doc="language modeling"
)
@@ -171,7 +171,7 @@ class BartConfig(PretrainedConfig):
self.gradient_checkpointing = gradient_checkpointing
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True

- # ensure backward compatibilty for BART CNN models
+ # ensure backward compatibility for BART CNN models
if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False):
self.forced_bos_token_id = self.bos_token_id
warnings.warn(
@@ -111,7 +111,7 @@ class BartLearnedPositionalEmbedding(nn.Embedding):

def __init__(self, num_embeddings: int, embedding_dim: int):
# Bart is set up so that if padding_idx is specified then offset the embedding ids by 2
- # and adjust num_embeddings appropriately. Other models dont have this hack
+ # and adjust num_embeddings appropriately. Other models don't have this hack
self.offset = 2
super().__init__(num_embeddings + self.offset, embedding_dim)

@@ -236,9 +236,9 @@ class BartAttention(nn.Module):
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

if output_attentions:
- # this operation is a bit akward, but it's required to
+ # this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
- # In order to do so, attn_weights have to reshaped
+ # In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
@@ -116,7 +116,7 @@ class TFBartLearnedPositionalEmbedding(TFSharedEmbeddings):

def __init__(self, num_embeddings: int, embedding_dim: int, **kwargs):
# Bart is set up so that if padding_idx is specified then offset the embedding ids by 2
- # and adjust num_embeddings appropriately. Other models dont have this hack
+ # and adjust num_embeddings appropriately. Other models don't have this hack
self.offset = 2
super().__init__(num_embeddings + self.offset, embedding_dim, **kwargs)
@@ -304,7 +304,7 @@ class MecabTokenizer:


class CharacterTokenizer:
- """Runs Character tokenziation."""
+ """Runs Character tokenization."""

def __init__(self, vocab, unk_token, normalize_text=True):
"""
@@ -451,7 +451,7 @@ Twitter-aware tokenizer, designed to be flexible and easy to adapt to new domain
the class Tokenizer.

4. When instantiating Tokenizer objects, there is a single option: preserve_case. By default, it is set to True. If it
- is set to False, then the tokenizer will downcase everything except for emoticons.
+ is set to False, then the tokenizer will lowercase everything except for emoticons.

"""
@@ -724,7 +724,7 @@ class BigBirdBlockSparseAttention(nn.Module):
band_product, dim=-1
) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, (5+n_rand_blocks)*to_block_size]

- # contibution of sliding keys
+ # contribution of sliding keys
# [bsz, n_heads, m//from_block_size-4, from_block_size, 3*to_block_size] x [bsz, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1]
context_layer = self.torch_bmm_nd(
attn_weights[:, :, :, :, to_block_size : 4 * to_block_size], exp_blocked_value_matrix, ndim=5

@@ -876,7 +876,7 @@ class BigBirdBlockSparseAttention(nn.Module):
attn_probs_view[:, :, q_idx, :, q_idx : q_idx + 3, :] = right_slice.view(
bsz, n_heads, from_block_size, 3, to_block_size
) # inner_band_product
- # global keys (correspomding to 1st key block)
+ # global keys (corresponding to 1st key block)
attention_probs[:, :, 2 * from_block_size : -2 * from_block_size, :to_block_size] = attn_weights[
:, :, :, :, :to_block_size
].view(
@@ -946,7 +946,7 @@ class BigBirdBlockSparseAttention(nn.Module):

@staticmethod
def torch_gather_b2(params, indices):
- # this operation is equilvalent to tf.gather when batch_dims=2
+ # this operation is equivalent to tf.gather when batch_dims=2

if params.shape[:2] != indices.shape[:2]:
raise ValueError(
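For context on torch_gather_b2 above, a hedged sketch of what "tf.gather with batch_dims=2" means in PyTorch terms (shapes are illustrative, not taken from the diff):

    import torch

    def gather_b2_sketch(params, indices):
        # params: (B, H, N, D), indices: (B, H, K) -> output: (B, H, K, D)
        # For every (batch, head) pair, pick the N-axis rows listed in `indices`.
        b, h, k = indices.shape
        d = params.shape[-1]
        idx = indices.unsqueeze(-1).expand(b, h, k, d)
        return torch.gather(params, dim=2, index=idx)

    params = torch.arange(2 * 1 * 4 * 3).reshape(2, 1, 4, 3)
    indices = torch.tensor([[[0, 2]], [[3, 1]]])       # shape (2, 1, 2)
    print(gather_b2_sketch(params, indices).shape)     # torch.Size([2, 1, 2, 3])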
@@ -1054,7 +1054,7 @@ class BigBirdBlockSparseAttention(nn.Module):
to_block_size: int. size of block in to sequence.
num_rand_blocks: int. Number of random chunks per row.
last_idx: if -1 then num_rand_blocks blocks chosen anywhere in to sequence,
- if positive then num_rand_blocks blocks choosen only upto last_idx.
+ if positive then num_rand_blocks blocks chosen only up to last_idx.

Returns:
adjacency list of size from_seq_length//from_block_size-2 by num_rand_blocks
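Not part of the diff - a toy illustration of the behaviour the docstring above describes (names follow the docstring; the real routine also excludes the sliding-window and global blocks):

    import numpy as np

    def sample_rand_blocks(to_blocks, num_rand_blocks, last_idx=-1, seed=0):
        # Candidate block ids; if last_idx is positive, only blocks before last_idx may be chosen
        high = to_blocks if last_idx < 0 else last_idx
        candidates = np.arange(1, high)  # assumed: block 0 is reserved for global attention
        rng = np.random.default_rng(seed)
        return rng.permutation(candidates)[:num_rand_blocks]

    print(sample_rand_blocks(to_blocks=12, num_rand_blocks=3, last_idx=8))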
@@ -1149,7 +1149,7 @@ class BigBirdBlockSparseAttention(nn.Module):
plan_block_length = np.array(plan_from_length) // from_block_size
# till when to follow plan
max_plan_idx = plan_from_length.index(from_seq_length)
- # Random Attention adjajency list
+ # Random Attention adjacency list
rand_attn = [
np.zeros((num_blocks, np.sum(plan_num_rand_blocks[: max_plan_idx + 1])), dtype=np.int32)
for i in range(num_heads)

@@ -1246,8 +1246,8 @@ class BigBirdBlockSparseAttention(nn.Module):

Args:
block_id: int. block id of row.
- to_start_block_id: int. random attention coloum start id.
- to_end_block_id: int. random attention coloum end id.
+ to_start_block_id: int. random attention column start id.
+ to_end_block_id: int. random attention column end id.
num_rand_blocks: int. number of random blocks to be selected.
window_block_left: int. number of blocks of window to left of a block.
window_block_right: int. number of blocks of window to right of a block.
@@ -1825,7 +1825,7 @@ BIG_BIRD_INPUTS_DOCSTRING = r"""
@dataclass
class BigBirdForPreTrainingOutput(ModelOutput):
"""
- Output type of :class:`~transformers.BigBirdtForPreTraining`.
+ Output type of :class:`~transformers.BigBirdForPreTraining`.

Args:
loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`):

@@ -2941,7 +2941,7 @@ class BigBirdForQuestionAnswering(BigBirdPreTrainedModel):

logits_mask = None
if question_lengths is not None:
- # setting lengths logits to `-infi`
+ # setting lengths logits to `-inf`
logits_mask = self.prepare_question_mask(question_lengths, seqlen)
if token_type_ids is None:
token_type_ids = (~logits_mask).long()
@@ -237,9 +237,9 @@ class BlenderbotAttention(nn.Module):
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

if output_attentions:
- # this operation is a bit akward, but it's required to
+ # this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
- # In order to do so, attn_weights have to reshaped
+ # In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)

@@ -235,9 +235,9 @@ class BlenderbotSmallAttention(nn.Module):
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

if output_attentions:
- # this operation is a bit akward, but it's required to
+ # this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
- # In order to do so, attn_weights have to reshaped
+ # In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
@@ -695,7 +695,7 @@ CONVBERT_INPUTS_DOCSTRING = r"""
Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:

- 1 for tokens that are **not masked**,
- - 0 for tokens that are **maked**.
+ - 0 for tokens that are **masked**.

`What are attention masks? <../glossary.html#attention-mask>`__
token_type_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):

@@ -739,7 +739,7 @@ CONVBERT_INPUTS_DOCSTRING = r"""


@add_start_docstrings(
- "The bare ConvBERT Model transformer outputing raw hidden-states without any specific head on top.",
+ "The bare ConvBERT Model transformer outputting raw hidden-states without any specific head on top.",
CONVBERT_START_DOCSTRING,
)
class TFConvBertModel(TFConvBertPreTrainedModel):
@@ -683,7 +683,7 @@ class CTRLForSequenceClassification(CTRLPreTrainedModel):
sequence_lengths = -1
logger.warning(
f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
- f"unexpected if using padding tokens in conjuction with `inputs_embeds.`"
+ f"unexpected if using padding tokens in conjunction with `inputs_embeds.`"
)

pooled_logits = logits[range(batch_size), sequence_lengths]
@@ -511,7 +511,7 @@ def build_relative_position(query_size, key_size, bucket_size=-1, max_position=-
query_size (int): the length of query
key_size (int): the length of key
bucket_size (int): the size of position bucket
- max_position (int): the maxium allowed absolute positoin
+ max_position (int): the maximum allowed absolute position

Return:
:obj:`torch.LongTensor`: A tensor with shape [1, query_size, key_size]
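For context on the hunk above, a hedged sketch of the relative-position matrix being documented (bucketing by bucket_size/max_position omitted):

    import torch

    def build_relative_position_sketch(query_size: int, key_size: int) -> torch.LongTensor:
        q_ids = torch.arange(query_size).view(-1, 1)   # query positions as a column
        k_ids = torch.arange(key_size).view(1, -1)     # key positions as a row
        rel_pos = q_ids - k_ids                        # signed distance from each query to each key
        return rel_pos.unsqueeze(0)                    # shape [1, query_size, key_size], as documented

    print(build_relative_position_sketch(3, 4))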
@@ -698,7 +698,7 @@ class DisentangledSelfAttention(torch.nn.Module):
relative_pos = relative_pos.unsqueeze(1)
# bsz x height x query x key
elif relative_pos.dim() != 4:
- raise ValueError(f"Relative postion ids must be of dim 2 or 3 or 4. {relative_pos.dim()}")
+ raise ValueError(f"Relative position ids must be of dim 2 or 3 or 4. {relative_pos.dim()}")

att_span = self.pos_ebd_size
relative_pos = relative_pos.long().to(query_layer.device)
@@ -428,7 +428,7 @@ class SPMTokenizer:

def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
- # \t, \n, and \r are technically contorl characters but we treat them
+ # \t, \n, and \r are technically control characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
@@ -134,7 +134,7 @@ def convert_fsmt_checkpoint_to_pytorch(fsmt_checkpoint_path, pytorch_dump_folder
f.write(json.dumps(src_vocab, ensure_ascii=False, indent=json_indent))

# detect whether this is a do_lower_case situation, which can be derived by checking whether we
- # have at least one upcase letter in the source vocab
+ # have at least one uppercase letter in the source vocab
do_lower_case = True
for k in src_vocab.keys():
if not k.islower():
@@ -252,7 +252,7 @@ FSMT_INPUTS_DOCSTRING = r"""
Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0, 1]``:

- 1 indicates the head is **not masked**,
- - 0 indicates the heas is **masked**.
+ - 0 indicates the head is **masked**.

decoder_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in ``[0, 1]``:

@@ -486,7 +486,7 @@ class FSMTEncoder(nn.Module):
Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:

- 1 indicates the head is **not masked**,
- - 0 indicates the heas is **masked**.
+ - 0 indicates the head is **masked**.

Returns:
BaseModelOutput or Tuple comprised of:

@@ -696,7 +696,7 @@ class FSMTDecoder(nn.Module):
Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:

- 1 indicates the head is **not masked**,
- - 0 indicates the heas is **masked**.
+ - 0 indicates the head is **masked**.

cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(num_layers, num_heads)`, `optional`):
Mask to nullify selected heads of the cross-attention modules. Mask values selected in ``[0, 1]``:
@@ -184,7 +184,7 @@ class FunnelAttentionStructure(nn.Module):
self.sin_dropout = nn.Dropout(config.hidden_dropout)
self.cos_dropout = nn.Dropout(config.hidden_dropout)
# Track where we are at in terms of pooling from the original input, e.g., by how much the sequence length was
- # dividide.
+ # divided.
self.pooling_mult = None

def init_attention_inputs(self, inputs_embeds, attention_mask=None, token_type_ids=None):

@@ -218,7 +218,7 @@ class FunnelAttentionStructure(nn.Module):
For the factorized attention, it returns the matrices (phi, pi, psi, omega) used in the paper, appendix A.2.2,
final formula.

- For the relative shif attention, it returns all possible vectors R used in the paper, appendix A.2.1, final
+ For the relative shift attention, it returns all possible vectors R used in the paper, appendix A.2.1, final
formula.

Paper link: https://arxiv.org/abs/2006.03236

@@ -169,7 +169,7 @@ class TFFunnelAttentionStructure:
For the factorized attention, it returns the matrices (phi, pi, psi, omega) used in the paper, appendix A.2.2,
final formula.

- For the relative shif attention, it returns all possible vectors R used in the paper, appendix A.2.1, final
+ For the relative shift attention, it returns all possible vectors R used in the paper, appendix A.2.1, final
formula.

Paper link: https://arxiv.org/abs/2006.03236
@@ -1009,7 +1009,7 @@ class TFFunnelForPreTrainingOutput(ModelOutput):
Args:
logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
Prediction scores of the head (scores for each token before SoftMax).
- hidden_states (:obj:`tuple(tf.ensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+ hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -70,7 +70,7 @@ class TFAttention(tf.keras.layers.Layer):
super().__init__(**kwargs)

n_state = nx # in Attention: n_state=768 (nx=n_embd)
- # [switch nx => n_state from Block to Attention to keep identical to TF implem]
+ # [switch nx => n_state from Block to Attention to keep identical to TF implementation]
assert n_state % config.n_head == 0
self.n_ctx = n_ctx
self.n_head = config.n_head
@@ -70,7 +70,7 @@ def bytes_to_unicode():

The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
- decent coverage. This is a signficant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
+ decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
tables between utf-8 bytes and unicode strings.
"""
bs = (
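Not part of the diff - a sketch of the byte-to-unicode table the docstring above describes: every possible byte is mapped to a printable unicode character so BPE can operate on arbitrary UTF-8 without UNKs (a condensed version of the well-known GPT-2 helper):

    def bytes_to_unicode_sketch():
        # Bytes that are already printable keep their own code point ...
        bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
        cs = bs[:]
        n = 0
        for b in range(2 ** 8):
            if b not in bs:
                # ... the remaining bytes are shifted into unused code points above 255
                bs.append(b)
                cs.append(2 ** 8 + n)
                n += 1
        return dict(zip(bs, (chr(c) for c in cs)))

    table = bytes_to_unicode_sketch()
    print(len(table), table[32])  # 256 entries; the space byte maps to a visible character ('Ġ')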
@@ -189,7 +189,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
self.cache = {}
self.add_prefix_space = add_prefix_space

- # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
+ # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

@property
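Not part of the diff - a small demonstration of the point made in the comment above: without re.IGNORECASE the contraction alternatives only match lowercase, so capitalized contractions split differently (assumes the `regex` package, which supports \p{L}):

    import regex as re

    pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
    print(re.findall(pat, "i'll stay"))   # ['i', "'ll", ' stay'] - "'ll" hits the contraction branch
    print(re.findall(pat, "I'LL stay"))   # ['I', "'", 'LL', ' stay'] - uppercase "'LL" does not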
@@ -247,7 +247,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
for token in re.findall(self.pat, text):
token = "".join(
self.byte_encoder[b] for b in token.encode("utf-8")
- ) # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case)
+ ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
return bpe_tokens
@@ -221,7 +221,7 @@ class GPTNeoAttentionMixin:
if attention_mask is None:
attention_mask = torch.ones(batch_size, seq_length, dtype=torch.long, device=device)

- # A block can also be padded becuase of the _look_back operation
+ # A block can also be padded because of the _look_back operation
# look back into the attention_block such that it will also get padded the same way
# and have 0s in the padded position
attention_mask = GPTNeoAttentionMixin._look_back(attention_mask, block_length, window_size, is_key_value=False)

@@ -804,8 +804,8 @@ class GPTNeoModel(GPTNeoPreTrainedModel):

# Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head
- # attention_probs has shape bsz x num_headss x N x N
- # head_mask has shape n_layer x batch x num_headss x N x N
+ # attention_probs has shape bsz x num_heads x N x N
+ # head_mask has shape n_layer x batch x num_heads x N x N
head_mask = self.get_head_mask(head_mask, self.config.num_layers)

if inputs_embeds is None:
@@ -35,11 +35,11 @@ class QuantEmbedding(nn.Module):
:obj:`torch.nn.Embedding`.

Args:
- weight_bit (:obj:`int`, `optiona`l, defaults to :obj:`8`):
+ weight_bit (:obj:`int`, `optional`, defaults to :obj:`8`):
Bitwidth for the quantized weight.
- momentum (:obj:`float`, `optional, defaults to :obj:`0.95`):
+ momentum (:obj:`float`, `optional`, defaults to :obj:`0.95`):
Momentum for updating the activation quantization range.
- quant_mode (:obj:`bool`, `optional, defaults to :obj:`False`):
+ quant_mode (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the layer is quantized.
"""
@@ -619,7 +619,7 @@ def symmetric_linear_quantization_params(num_bits, saturation_min, saturation_ma
`saturation_max`.
"""
# in this part, we do not need any gradient computation,
- # in order to enfore this, we put torch.no_grad()
+ # in order to enforce this, we put torch.no_grad()
with torch.no_grad():
n = 2 ** (num_bits - 1) - 1
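For context on the hunk above, a hedged sketch of a symmetric linear quantization scale: the largest absolute saturation value is mapped onto the signed integer grid, and the computation sits under torch.no_grad() as the comment explains.

    import torch

    def symmetric_scale_sketch(num_bits, saturation_min, saturation_max):
        with torch.no_grad():                  # no gradients should flow through the range statistics
            n = 2 ** (num_bits - 1) - 1        # e.g. 127 for 8-bit signed
            bound = torch.max(saturation_min.abs(), saturation_max.abs())
            scale = torch.clamp(bound, min=1e-8) / n
        return scale

    print(symmetric_scale_sketch(8, torch.tensor(-0.5), torch.tensor(2.0)))  # tensor(0.0157)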
@@ -842,9 +842,9 @@ class LEDDecoderAttention(nn.Module):
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

if output_attentions:
- # this operation is a bit akward, but it's required to
+ # this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
- # In order to do so, attn_weights have to reshaped
+ # In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)

@@ -1499,7 +1499,7 @@ LED_INPUTS_DOCSTRING = r"""
Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0, 1]``:

- 1 indicates the head is **not masked**,
- - 0 indicates the heas is **masked**.
+ - 0 indicates the head is **masked**.

decoder_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in ``[0, 1]``:
@@ -1689,7 +1689,7 @@ class LEDEncoder(LEDPreTrainedModel):
Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:

- 1 indicates the head is **not masked**,
- - 0 indicates the heas is **masked**.
+ - 0 indicates the head is **masked**.
inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
representation. This is useful if you want more control over how to convert :obj:`input_ids` indices

@@ -1920,7 +1920,7 @@ class LEDDecoder(LEDPreTrainedModel):
Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:

- 1 indicates the head is **not masked**,
- - 0 indicates the heas is **masked**.
+ - 0 indicates the head is **masked**.

cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
Mask to nullify selected heads of the cross-attention modules. Mask values selected in ``[0, 1]``:
@@ -869,7 +869,7 @@ class TFLEDEncoderSelfAttention(tf.keras.layers.Layer):
# compute global attn probs
global_attn_probs_float = tf.nn.softmax(global_attn_scores, axis=-1)

- # apply layer head maskin
+ # apply layer head masking
if layer_head_mask is not None:
if tf.executing_eagerly():
tf.debugging.assert_equal(

@@ -1552,7 +1552,7 @@ LED_INPUTS_DOCSTRING = r"""
Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0, 1]``:

- 1 indicates the head is **not masked**,
- - 0 indicates the heas is **masked**.
+ - 0 indicates the head is **masked**.

decoder_head_mask (:obj:`tf.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in ``[0, 1]``:

@@ -1667,7 +1667,7 @@ class TFLEDEncoder(tf.keras.layers.Layer):
Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:

- 1 indicates the head is **not masked**,
- - 0 indicates the heas is **masked**.
+ - 0 indicates the head is **masked**.

inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
@@ -1926,14 +1926,14 @@ class TFLEDDecoder(tf.keras.layers.Layer):
Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:

- 1 indicates the head is **not masked**,
- - 0 indicates the heas is **masked**.
+ - 0 indicates the head is **masked**.

encoder_head_mask (:obj:`tf.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`):
Mask to nullify selected heads of the attention modules in encoder to avoid performing cross-attention
on hidden heads. Mask values selected in ``[0, 1]``:

- 1 indicates the head is **not masked**,
- - 0 indicates the heas is **masked**.
+ - 0 indicates the head is **masked**.

past_key_values (:obj:`Tuple[Tuple[tf.Tensor]]` of length :obj:`config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up
@@ -393,7 +393,7 @@ class LongformerTokenClassifierOutput(ModelOutput):

def _get_question_end_index(input_ids, sep_token_id):
"""
- Computes the index of the first occurance of `sep_token_id`.
+ Computes the index of the first occurrence of `sep_token_id`.
"""

sep_token_indices = (input_ids == sep_token_id).nonzero()

@@ -1428,7 +1428,7 @@ LONGFORMER_INPUTS_DOCSTRING = r"""
Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0, 1]``:

- 1 indicates the head is **not masked**,
- - 0 indicates the heas is **masked**.
+ - 0 indicates the head is **masked**.

decoder_head_mask (:obj:`torch.Tensor` of shape :obj:`(num_layers, num_heads)`, `optional`):
Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in ``[0, 1]``:
@@ -1388,7 +1388,7 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
# compute global attn probs
global_attn_probs_float = tf.nn.softmax(global_attn_scores, axis=-1)

- # apply layer head maskin
+ # apply layer head masking
if layer_head_mask is not None:
if tf.executing_eagerly():
tf.debugging.assert_equal(

@@ -1707,7 +1707,7 @@ class TFLongformerMainLayer(tf.keras.layers.Layer):
inputs["attention_mask"], (attention_mask_shape[0], attention_mask_shape[1], 1, 1)
)

- # Since attention_mask is 1.0 for positions we want to locall attend locally and 0.0 for
+ # Since attention_mask is 1.0 for positions we want to attend locally and 0.0 for
# masked and global attn positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
# Since we are adding it to the raw scores before the softmax, this is

@@ -1920,7 +1920,7 @@ LONGFORMER_INPUTS_DOCSTRING = r"""
Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:

- 1 indicates the head is **not masked**,
- - 0 indicates the heas is **masked**.
+ - 0 indicates the head is **masked**.

global_attention_mask (:obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
Mask to decide the attention given on each token, local attention or global attention. Tokens with global
@@ -95,9 +95,9 @@ class LxmertConfig(PretrainedConfig):
Whether or not to add masked language modeling (as used in pretraining models such as BERT) to the loss
objective.
task_obj_predict (:obj:`bool`, `optional`, defaults to :obj:`True`):
- Whether or not to add object prediction, attribute ppredictionand feature regression to the loss objective.
+ Whether or not to add object prediction, attribute prediction and feature regression to the loss objective.
task_qa (:obj:`bool`, `optional`, defaults to :obj:`True`):
- Whether or not to add the question-asansweringoss to the objective
+ Whether or not to add the question-answering loss to the objective
visual_obj_loss (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to calculate the object-prediction loss objective
visual_attr_loss (:obj:`bool`, `optional`, defaults to :obj:`True`):
@@ -306,9 +306,9 @@ class M2M100Attention(nn.Module):
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

if output_attentions:
- # this operation is a bit akward, but it's required to
+ # this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
- # In order to do so, attn_weights have to reshaped
+ # In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)

@@ -252,9 +252,9 @@ class MarianAttention(nn.Module):
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

if output_attentions:
- # this operation is a bit akward, but it's required to
+ # this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
- # In order to do so, attn_weights have to reshaped
+ # In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
@@ -117,7 +117,7 @@ class MBartLearnedPositionalEmbedding(nn.Embedding):

def __init__(self, num_embeddings: int, embedding_dim: int):
# MBart is set up so that if padding_idx is specified then offset the embedding ids by 2
- # and adjust num_embeddings appropriately. Other models dont have this hack
+ # and adjust num_embeddings appropriately. Other models don't have this hack
self.offset = 2
super().__init__(num_embeddings + self.offset, embedding_dim)

@@ -243,9 +243,9 @@ class MBartAttention(nn.Module):
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

if output_attentions:
- # this operation is a bit akward, but it's required to
+ # this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
- # In order to do so, attn_weights have to reshaped
+ # In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)

@@ -118,7 +118,7 @@ class TFMBartLearnedPositionalEmbedding(TFSharedEmbeddings):

def __init__(self, num_embeddings: int, embedding_dim: int, **kwargs):
# MBart is set up so that if padding_idx is specified then offset the embedding ids by 2
- # and adjust num_embeddings appropriately. Other models dont have this hack
+ # and adjust num_embeddings appropriately. Other models don't have this hack
self.offset = 2
super().__init__(num_embeddings + self.offset, embedding_dim, **kwargs)
@@ -690,7 +690,7 @@ class TFMBartEncoder(tf.keras.layers.Layer):
Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:

- 1 indicates the head is **not masked**,
- - 0 indicates the heas is **masked**.
+ - 0 indicates the head is **masked**.

inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
@@ -1487,7 +1487,7 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel):

@add_start_docstrings(
"""
- MoibleBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
+ MobileBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
for Named-Entity-Recognition (NER) tasks.
""",
MOBILEBERT_START_DOCSTRING,

@@ -674,7 +674,7 @@ MPNET_INPUTS_DOCSTRING = r"""


@add_start_docstrings(
- "The bare MPNet Model transformer outputing raw hidden-states without any specific head on top.",
+ "The bare MPNet Model transformer outputting raw hidden-states without any specific head on top.",
MPNET_START_DOCSTRING,
)
class TFMPNetModel(TFMPNetPreTrainedModel):
@@ -154,7 +154,7 @@ class MPNetTokenizerFast(PreTrainedTokenizerFast):
:obj:`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while
not having been set.

- MPNet tokenizer has a special mask token to be usble in the fill-mask pipeline. The mask token will greedily
+ MPNet tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
comprise the space before the `<mask>`.
"""
if self._mask_token is None and self.verbose:
@@ -146,7 +146,7 @@ class Attention(nn.Module):
def __init__(self, nx, n_ctx, config, scale=False):
super().__init__()
n_state = nx # in Attention: n_state=768 (nx=n_embd)
- # [switch nx => n_state from Block to Attention to keep identical to TF implem]
+ # [switch nx => n_state from Block to Attention to keep identical to TF implementation]
assert n_state % config.n_head == 0
self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
self.n_head = config.n_head

@@ -178,7 +178,7 @@ class Attention(nn.Module):
w = torch.matmul(q, k)
if self.scale:
w = w / math.sqrt(v.size(-1))
- # w = w * self.bias + -1e9 * (1 - self.bias) # TF implem method: mask_attn_weights
+ # w = w * self.bias + -1e9 * (1 - self.bias) # TF implementation method: mask_attn_weights
# XD: self.b may be larger than w, so we need to crop it
b = self.bias[:, :, : w.size(-2), : w.size(-1)]
w = w * b + -1e4 * (1 - b)

@@ -202,11 +202,11 @@ class Attention(nn.Module):
def merge_heads(self, x):
x = x.permute(0, 2, 1, 3).contiguous()
new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
- return x.view(*new_x_shape) # in Tensorflow implem: fct merge_states
+ return x.view(*new_x_shape) # in Tensorflow implementation: fct merge_states

def split_heads(self, x, k=False):
new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
- x = x.view(*new_x_shape) # in Tensorflow implem: fct split_states
+ x = x.view(*new_x_shape) # in Tensorflow implementation: fct split_states
if k:
return x.permute(0, 2, 3, 1)
else:
@@ -467,7 +467,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
raise ValueError("You have to specify either input_ids or inputs_embeds")

if position_ids is None:
- # Code is different from when we had a single embedding matrice from position and token embeddings
+ # Code is different from when we had a single embedding matrix from position and token embeddings
position_ids = self.position_ids[None, : input_shape[-1]]

# Attention mask.

@@ -814,7 +814,7 @@ class OpenAIGPTForSequenceClassification(OpenAIGPTPreTrainedModel):
sequence_lengths = -1
logger.warning(
f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
- f"unexpected if using padding tokens in conjuction with `inputs_embeds.`"
+ f"unexpected if using padding tokens in conjunction with `inputs_embeds.`"
)

pooled_logits = logits[range(batch_size), sequence_lengths]
@@ -62,7 +62,7 @@ class TFAttention(tf.keras.layers.Layer):
super().__init__(**kwargs)

n_state = nx # in Attention: n_state=768 (nx=n_embd)
- # [switch nx => n_state from Block to Attention to keep identical to TF implem]
+ # [switch nx => n_state from Block to Attention to keep identical to TF implementation]
assert (
n_state % config.n_head == 0
), f"Hidden dimension {n_state} not dividable by number of heads {config.n_head}"
@@ -252,9 +252,9 @@ class PegasusAttention(nn.Module):
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

if output_attentions:
- # this operation is a bit akward, but it's required to
+ # this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
- # In order to do so, attn_weights have to reshaped
+ # In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)

@@ -719,7 +719,7 @@ class TFPegasusEncoder(tf.keras.layers.Layer):
Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:

- 1 indicates the head is **not masked**,
- - 0 indicates the heas is **masked**.
+ - 0 indicates the head is **masked**.

inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
@@ -723,9 +723,9 @@ class ProphetNetAttention(nn.Module):
attn_weights = attn_weights + attention_mask

if output_attentions:
- # this operation is a bit akward, but it's required to
+ # this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
- # In order to do so, attn_weights have to reshaped
+ # In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(batch_size, self.num_attn_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(batch_size * self.num_attn_heads, tgt_len, src_len)

@@ -1243,7 +1243,7 @@ class ProphetNetEncoder(ProphetNetPreTrainedModel):
r"""
word_embeddings (:obj:`torch.nn.Embeddings` of shape :obj:`(config.vocab_size, config.hidden_size)`, `optional`):
The word embedding parameters. This can be used to initialize :class:`~transformers.ProphetNetEncoder` with
- pre-defined word embeddings instead of randomely initialized word embeddings.
+ pre-defined word embeddings instead of randomly initialized word embeddings.
"""

def __init__(self, config: ProphetNetConfig, word_embeddings: nn.Embedding = None):

@@ -1380,7 +1380,7 @@ class ProphetNetDecoder(ProphetNetPreTrainedModel):
r"""
word_embeddings (:obj:`torch.nn.Embeddings` of shape :obj:`(config.vocab_size, config.hidden_size)`, `optional`):
The word embedding parameters. This can be used to initialize :class:`~transformers.ProphetNetEncoder` with
- pre-defined word embeddings instead of randomely initialized word embeddings.
+ pre-defined word embeddings instead of randomly initialized word embeddings.
"""

def __init__(self, config: ProphetNetConfig, word_embeddings: nn.Embedding = None):
@@ -285,7 +285,7 @@ class TFRagPreTrainedModel(TFPreTrainedModel):

>>> # load retriever
>>> retriever = RagRetriever.from_pretrained(PATH, index_name="exact", use_dummy_dataset=True)
- >>> # load fine-tuned model with retriver
+ >>> # load fine-tuned model with retriever
>>> model = TFRagModel.from_pretrained("./rag", retriever=retriever)
"""

@@ -234,7 +234,7 @@ class CanonicalHFIndex(HFIndexBase):
Args:
vector_size (:obj:`int`): the dimension of the passages embeddings used by the index
dataset_name (:obj:`str`, optional, defaults to ``wiki_dpr``):
- A datatset identifier of the indexed dataset on HuggingFace AWS bucket (list all available datasets and ids
+ A dataset identifier of the indexed dataset on HuggingFace AWS bucket (list all available datasets and ids
with ``datasets.list_datasets()``).
dataset_split (:obj:`str`, optional, defaults to ``train``)
Which split of the ``dataset`` to load.
@@ -442,7 +442,7 @@ class RagRetriever:

def init_retrieval(self):
"""
- Retriever initalization function. It loads the index into memory.
+ Retriever initialization function. It loads the index into memory.
"""

logger.info("initializing retrieval")
@@ -612,7 +612,7 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
if isinstance(self.num_buckets, int):
assert (
self.num_buckets % 2 == 0
- ), f"There should be an even number of bucktes, but `self.num_bucktes`: {self.num_buckets}"
+ ), f"There should be an even number of buckets, but `self.num_buckets`: {self.num_buckets}"
rotation_size = self.num_buckets
num_buckets = self.num_buckets
else:
@@ -179,7 +179,7 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
:obj:`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while
not having been set.

- Roberta tokenizer has a special mask token to be usble in the fill-mask pipeline. The mask token will greedily
+ Roberta tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
comprise the space before the `<mask>`.
"""
if self._mask_token is None and self.verbose:
@@ -319,9 +319,9 @@ class Speech2TextAttention(nn.Module):
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

if output_attentions:
- # this operation is a bit akward, but it's required to
+ # this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
- # In order to do so, attn_weights have to reshaped
+ # In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)

@@ -559,7 +559,7 @@ class Speech2TextPreTrainedModel(PreTrainedModel):
return input_lengths

def _get_subsampled_encoder_attn_mask(self, attention_mask):
- # generate creates 3D attention mask, becuase of the shape of input_features
+ # generate creates 3D attention mask, because of the shape of input_features
# convert it to 2D if thats the case
if len(attention_mask.shape) > 2:
attention_mask = attention_mask[:, :, -1]
@@ -1172,7 +1172,7 @@ T5_ENCODER_INPUTS_DOCSTRING = r"""
Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
"""

- # Warning messafe for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
+ # Warning message for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
__HEAD_MASK_WARNING_MSG = """
The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently,
`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions.

@@ -637,7 +637,7 @@ class TFT5MainLayer(tf.keras.layers.Layer):
raise ValueError(f"You have to specify either {err_msg_prefix}inputs or {err_msg_prefix}inputs_embeds")

if inputs["inputs_embeds"] is None:
- assert self.embed_tokens is not None, "You have to intialize the model with valid token embeddings"
+ assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings"
inputs["inputs_embeds"] = self.embed_tokens(inputs["input_ids"])

batch_size, seq_length = input_shape
@@ -2021,7 +2021,7 @@ def _calculate_aggregate_mask(answer, pooled_output, cell_selection_preference,
apply to numbers. If the answer is a number but does not appear in the table then we must use some aggregation
case. The ambiguous case is when the answer is a number that also appears in the table. In this case we use the
aggregation function probabilities predicted by the model to decide whether to select or aggregate. The threshold
- for this is a hyperparameter `cell_selection_preference
+ for this is a hyperparameter `cell_selection_preference`

Args:
answer (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, )`):

@@ -2041,7 +2041,7 @@ def _calculate_aggregate_mask(answer, pooled_output, cell_selection_preference,
aggregate_mask_init = torch.logical_not(torch.isnan(answer)).type(torch.FloatTensor).to(answer.device)
logits_aggregation = aggregation_classifier(pooled_output)
dist_aggregation = torch.distributions.categorical.Categorical(logits=logits_aggregation)
- # Index 0 correponds to "no aggregation".
+ # Index 0 corresponds to "no aggregation".
aggregation_ops_total_mass = torch.sum(dist_aggregation.probs[:, 1:], dim=1)

# Cell selection examples according to current model.

@@ -2126,7 +2126,7 @@ def _calculate_aggregation_loss_unknown(logits_aggregation, aggregate_mask):
answer supervision) per example.
"""
dist_aggregation = torch.distributions.categorical.Categorical(logits=logits_aggregation)
- # Index 0 correponds to "no aggregation".
+ # Index 0 corresponds to "no aggregation".
aggregation_ops_total_mass = torch.sum(dist_aggregation.probs[:, 1:], dim=1)
# Predict some aggregation in case of an answer that needs aggregation.
# This increases the probability of all aggregation functions, in a way
@@ -2357,7 +2357,7 @@ def _get_numeric_value_from_date(date, mask):


def _get_span_length_key(span):
"""Sorts span by decreasing length first and incresing first index second."""
"""Sorts span by decreasing length first and increasing first index second."""
return span[1] - span[0], -span[0]

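A tiny usage sketch of the sort key above; the spans are invented `(start, end)` pairs.

def _get_span_length_key(span):
    """Sorts span by decreasing length first and increasing first index second."""
    return span[1] - span[0], -span[0]

# Longest spans come first; ties are broken by the earlier start index.
spans = [(3, 5), (0, 4), (2, 6)]
print(sorted(spans, key=_get_span_length_key, reverse=True))
# [(0, 4), (2, 6), (3, 5)]
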
@@ -292,7 +292,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
elif "<unk>" in self.sym2idx:
self.unk_idx = self.sym2idx["<unk>"]
else:
raise ValueError("No <unkown> token in vocabulary")
raise ValueError("No <unknown> token in vocabulary")

def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if os.path.isdir(save_directory):

@@ -382,9 +382,9 @@ class Wav2Vec2Attention(nn.Module):
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

if output_attentions:
# this operation is a bit akward, but it's required to
# this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to reshaped
# In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)

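A minimal sketch of the double reshape the comment above describes; the sizes are arbitrary. The point is that `attn_weights_reshaped` stays a view in the autograd graph, so it can be returned for `output_attentions` while the flattened tensor continues through the layer.

import torch

bsz, num_heads, tgt_len, src_len = 2, 4, 3, 3  # illustrative sizes
attn_weights = torch.rand(bsz * num_heads, tgt_len, src_len, requires_grad=True)

# Keep a (bsz, num_heads, tgt_len, src_len) view to return to the caller ...
attn_weights_reshaped = attn_weights.view(bsz, num_heads, tgt_len, src_len)
# ... and reshape back so the rest of the attention computation is unchanged.
attn_weights = attn_weights_reshaped.view(bsz * num_heads, tgt_len, src_len)
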
@@ -111,7 +111,7 @@ class Wav2Vec2Processor:
When used in normal mode, this method forwards all its arguments to Wav2Vec2FeatureExtractor's
:meth:`~transformers.Wav2Vec2FeatureExtractor.__call__` and returns its output. If used in the context
:meth:`~transformers.Wav2Vec2Processor.as_target_processor` this method forwards all its arguments to
Wav2Vec2CTCTokenizer's :meth:`~transformers.Wav2Vec2CTCTokenizer.__call__`. Please refer to the doctsring of
Wav2Vec2CTCTokenizer's :meth:`~transformers.Wav2Vec2CTCTokenizer.__call__`. Please refer to the docstring of
the above two methods for more information.
"""
return self.current_processor(*args, **kwargs)

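A usage sketch of the two calling modes this docstring describes, assuming the public `facebook/wav2vec2-base-960h` checkpoint; the audio and transcription values are placeholders.

from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

raw_speech = [0.0] * 16_000    # one second of (silent) 16 kHz audio, for illustration only
transcription = "HELLO WORLD"  # placeholder target text

# Normal mode: arguments are forwarded to the feature extractor.
inputs = processor(raw_speech, sampling_rate=16_000, return_tensors="pt")

# Target mode: arguments are forwarded to the CTC tokenizer.
with processor.as_target_processor():
    labels = processor(transcription, return_tensors="pt").input_ids
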
@@ -348,7 +348,7 @@ XLM_INPUTS_DOCSTRING = r"""
A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
languages ids which can be obtained from the language names by using two conversion mappings provided in
the configuration of the model (only provided for multilingual models). More precisely, the `language name
to language id` mapping is in :obj:`model.config.lang2id` (which is a dictionary strring to int) and the
to language id` mapping is in :obj:`model.config.lang2id` (which is a dictionary string to int) and the
`language id to language name` mapping is in :obj:`model.config.id2lang` (dictionary int to string).

See usage examples detailed in the :doc:`multilingual documentation <../multilingual>`.

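A short sketch of building the `langs` tensor from the mappings mentioned above, assuming the multilingual `xlm-clm-enfr-1024` checkpoint used in the multilingual docs; the input sentence is arbitrary.

import torch
from transformers import XLMTokenizer, XLMWithLMHeadModel

tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024")
model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024")

input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")])
language_id = model.config.lang2id["en"]          # language name -> language id
langs = torch.full_like(input_ids, language_id)   # one language id per input token
outputs = model(input_ids, langs=langs)
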
@@ -1188,7 +1188,7 @@ class XLMForMultipleChoice(XLMPreTrainedModel):

self.init_weights()

@add_start_docstrings_to_model_forward(XLM_INPUTS_DOCSTRING.format("batch_size, num_choicec, sequence_length"))
@add_start_docstrings_to_model_forward(XLM_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,

@@ -749,8 +749,8 @@ class XLMTokenizer(PreTrainedTokenizer):

def _tokenize(self, text, lang="en", bypass_tokenizer=False):
"""
Tokenize a string given language code. For Chinese, Japanese and Thai, we use a language specific
tokenizerself. Otherwise, we use Moses.
Tokenize a string given language code. For Chinese, Japanese and Thai, we use a language specific tokenizer.
Otherwise, we use Moses.

Details of tokenization:

@@ -1113,7 +1113,7 @@ XLNET_INPUTS_DOCSTRING = r"""
Mask values selected in ``[0, 1]``:

- 1 for tokens that are **masked**,
- 0 for tokens that are **not maked**.
- 0 for tokens that are **not masked**.

You can only uses one of :obj:`input_mask` and :obj:`attention_mask`.
head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):

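For clarity, `input_mask` uses the opposite convention of `attention_mask` (1 = masked, 0 = kept), which is why only one of the two should be passed; the values below are illustrative.

import torch

attention_mask = torch.tensor([[1, 1, 1, 0, 0]])  # 1 = attend, 0 = padding
input_mask = 1 - attention_mask                   # 1 = masked, 0 = kept
# Pass either attention_mask=attention_mask or input_mask=input_mask, never both.
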
@@ -182,7 +182,7 @@ class AdamWeightDecay(tf.keras.optimizers.Adam):
name (:obj:`str`, `optional`, defaults to 'AdamWeightDecay'):
Optional name for the operations created when applying gradients.
kwargs:
Keyward arguments. Allowed to be {``clipnorm``, ``clipvalue``, ``lr``, ``decay``}. ``clipnorm`` is clip
Keyword arguments. Allowed to be {``clipnorm``, ``clipvalue``, ``lr``, ``decay``}. ``clipnorm`` is clip
gradients by norm; ``clipvalue`` is clip gradients by value, ``decay`` is included for backward
compatibility to allow time inverse decay of learning rate. ``lr`` is included for backward compatibility,
recommended to use ``learning_rate`` instead.

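A construction sketch showing one of the allowed keyword arguments being forwarded; the hyperparameter values are arbitrary and TensorFlow must be installed.

from transformers import AdamWeightDecay

optimizer = AdamWeightDecay(
    learning_rate=5e-5,   # preferred over the legacy `lr` argument
    weight_decay_rate=0.01,
    clipnorm=1.0,         # forwarded to the underlying Keras Adam: clip gradients by norm
)
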
@@ -128,7 +128,7 @@ class Conversation:
"""
Iterates over all blobs of the conversation.

Retuns: Iterator of (is_user, text_chunk) in chronological order of the conversation. ``is_user`` is a
Returns: Iterator of (is_user, text_chunk) in chronological order of the conversation. ``is_user`` is a
:obj:`bool`, ``text_chunks`` is a :obj:`str`.
"""
for user_input, generated_response in zip(self.past_user_inputs, self.generated_responses):

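A usage sketch of iterating over a conversation in chronological order; the texts are invented.

from transformers import Conversation

conversation = Conversation("Is it a good movie?")
conversation.mark_processed()  # move the new user input into the history
conversation.append_response("Critics seem to think so.")

for is_user, text in conversation.iter_texts():
    speaker = "user" if is_user else "bot"
    print(f"{speaker}: {text}")
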
@@ -48,7 +48,7 @@ class Text2TextGenerationPipeline(Pipeline):

def check_inputs(self, input_length: int, min_length: int, max_length: int):
"""
Checks wether there might be something wrong with given input with regard to the model.
Checks whether there might be something wrong with given input with regard to the model.
"""
return True

@@ -204,7 +204,7 @@ class SummarizationPipeline(Text2TextGenerationPipeline):

def check_inputs(self, input_length: int, min_length: int, max_length: int) -> bool:
"""
Checks wether there might be something wrong with given input with regard to the model.
Checks whether there might be something wrong with given input with regard to the model.
"""
if input_length < min_length // 2:
logger.warning(

@@ -520,7 +520,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
filename_prefix: Optional[str] = None,
) -> Tuple[str]:
"""
Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens as well asin a unique JSON
Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens as well as in a unique JSON
file containing {config + vocab + added-tokens}.
"""
save_directory = str(save_directory)

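A usage sketch of requesting this legacy layout through the public saving API; the checkpoint name and directory are placeholders.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
# legacy_format=True requests the slow-tokenizer/legacy file layout described above.
tokenizer.save_pretrained("./my-tokenizer", legacy_format=True)
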
@@ -208,7 +208,7 @@ class Trainer:

Note that if it's a :obj:`torch.utils.data.dataset.IterableDataset` with some randomization and you are
training in a distributed fashion, your iterable dataset should either use a internal attribute
:obj:`generator` that is a :obj:`torch.Generator` for the randomization that must be identic on all
:obj:`generator` that is a :obj:`torch.Generator` for the randomization that must be identical on all
processes (and the Trainer will manually set the seed of this :obj:`generator` at each epoch) or have a
:obj:`set_epoch()` method that internally sets the seed of the RNGs used.
eval_dataset (:obj:`torch.utils.data.dataset.Dataset`, `optional`):

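A sketch of an iterable dataset that satisfies the requirement above: it exposes a `torch.Generator` attribute named `generator` and a `set_epoch()` method, so shuffling stays identical across processes. The class name and data handling are invented for illustration.

import torch
from torch.utils.data import IterableDataset

class ShuffledStream(IterableDataset):
    def __init__(self, data):
        self.data = data
        self.generator = torch.Generator()  # the Trainer can seed this each epoch

    def set_epoch(self, epoch: int):
        # Alternatively, re-seed here so every process shuffles the same way per epoch.
        self.generator.manual_seed(epoch)

    def __iter__(self):
        order = torch.randperm(len(self.data), generator=self.generator)
        return (self.data[i] for i in order)
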
@@ -42,7 +42,7 @@ class TrainerState:

In all this class, one step is to be understood as one update step. When using gradient accumulation, one
update step may require several forward and backward passes: if you use :obj:`gradient_accumulation_steps=n`,
then one update step requires going throuch `n` batches.
then one update step requires going through `n` batches.

Args:
epoch (:obj:`float`, `optional`):

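As a quick arithmetic check of the statement above, with illustrative numbers:

# With gradient_accumulation_steps=4, one update step consumes 4 batches, so an
# epoch of 1000 batches yields 250 update steps (the unit TrainerState counts in).
num_batches = 1000
gradient_accumulation_steps = 4
update_steps_per_epoch = num_batches // gradient_accumulation_steps  # 250
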
@@ -243,7 +243,7 @@ class SequentialDistributedSampler(Sampler):

def __init__(self, dataset, num_replicas=None, rank=None, batch_size=None):
warnings.warn(
"SequentialDistributedSampler is deprecated and will be removed in v5 of Tranformers.",
"SequentialDistributedSampler is deprecated and will be removed in v5 of Transformers.",
FutureWarning,
)
if num_replicas is None:

@@ -363,7 +363,7 @@ class DistributedTensorGatherer:

def __init__(self, world_size, num_samples, make_multiple_of=None, padding_index=-100):
warnings.warn(
"DistributedTensorGatherer is deprecated and will be removed in v5 of Tranformers.",
"DistributedTensorGatherer is deprecated and will be removed in v5 of Transformers.",
FutureWarning,
)
self.world_size = world_size

@@ -199,7 +199,7 @@ class Seq2SeqTrainer(Trainer):
def _pad_tensors_to_max_len(self, tensor, max_length):
if self.tokenizer is None:
raise ValueError(
f"Tensor need to be padded to `max_length={max_length}` but no tokenzier was passed when creating "
f"Tensor need to be padded to `max_length={max_length}` but no tokenizer was passed when creating "
"this `Trainer`. Make sure to create your `Trainer` with the appropriate tokenizer."
)
# If PAD token is not defined at least EOS token has to be defined

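A sketch of what padding to `max_length` looks like once a pad (or eos) token id is available; the ids and lengths below are invented.

import torch

pad_token_id = 0
tensor = torch.tensor([[5, 7, 9]])
max_length = 6

# Fill a (batch, max_length) tensor with the pad id, then copy the real tokens in.
padded = pad_token_id * torch.ones((tensor.shape[0], max_length), dtype=tensor.dtype)
padded[:, : tensor.shape[-1]] = tensor
print(padded)  # tensor([[5, 7, 9, 0, 0, 0]])
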
@@ -140,7 +140,7 @@ def get_verbosity() -> int:

def set_verbosity(verbosity: int) -> None:
"""
Set the vebosity level for the 🤗 Transformers's root logger.
Set the verbosity level for the 🤗 Transformers's root logger.

Args:
verbosity (:obj:`int`):

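A usage sketch of the verbosity setter; `INFO` is just one of the available levels.

from transformers.utils import logging

logging.set_verbosity(logging.INFO)  # explicit level
logging.set_verbosity_info()         # equivalent convenience helper
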