fix some typos in docs, comments, logging/errors (#11432)

LSinev 2021-04-26 16:14:25 +03:00 committed by GitHub
parent e3e70f9551
commit b24ead87e1
77 changed files with 142 additions and 142 deletions


@ -57,14 +57,14 @@ class AddNewModelCommand(BaseTransformersCLICommand):
if not _has_cookiecutter:
raise ImportError(
"Model creation dependencies are required to use the `add_new_model` command. Install them by running "
"the folowing at the root of your `transformers` clone:\n\n\t$ pip install -e .[modelcreation]\n"
"the following at the root of your `transformers` clone:\n\n\t$ pip install -e .[modelcreation]\n"
)
# Ensure that there is no other `cookiecutter-template-xxx` directory in the current working directory
directories = [directory for directory in os.listdir() if "cookiecutter-template-" == directory[:22]]
if len(directories) > 0:
raise ValueError(
"Several directories starting with `cookiecutter-template-` in current working directory. "
"Please clean your directory by removing all folders startign with `cookiecutter-template-` or "
"Please clean your directory by removing all folders starting with `cookiecutter-template-` or "
"change your working directory."
)


@ -244,7 +244,7 @@ def squad_convert_example_to_features(
cls_index = span["input_ids"].index(tokenizer.cls_token_id)
# p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer)
# Original TF implem also keep the classification token (set to 0)
# Original TF implementation also keep the classification token (set to 0)
p_mask = np.ones_like(span["token_type_ids"])
if tokenizer.padding_side == "right":
p_mask[len(truncated_query) + sequence_added_tokens :] = 0


@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Sequence feature extraction class for common feature extrcactors to preprocess sequences.
Sequence feature extraction class for common feature extractors to preprocess sequences.
"""
from typing import Dict, List, Optional, Union


@ -551,7 +551,7 @@ BACKENDS_MAPPING = OrderedDict(
("sklearn", (is_sklearn_available, SKLEARN_IMPORT_ERROR)),
("speech", (is_speech_available, SPEECH_IMPORT_ERROR)),
("tf", (is_tf_available, TENSORFLOW_IMPORT_ERROR)),
("tokenziers", (is_tokenizers_available, TOKENIZERS_IMPORT_ERROR)),
("tokenizers", (is_tokenizers_available, TOKENIZERS_IMPORT_ERROR)),
("torch", (is_torch_available, PYTORCH_IMPORT_ERROR)),
("vision", (is_vision_available, VISION_IMPORT_ERROR)),
]


@ -446,7 +446,7 @@ class NoBadWordsLogitsProcessor(LogitsProcessor):
class PrefixConstrainedLogitsProcessor(LogitsProcessor):
r"""
:class:`transformers.LogitsProcessor` that enforces contrained generation and is useful for prefix-conditioned
:class:`transformers.LogitsProcessor` that enforces constrained generation and is useful for prefix-conditioned
constrained generation. See `Autoregressive Entity Retrieval <https://arxiv.org/abs/2010.00904>`__ for more
information.
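A rough usage sketch of the prefix-constrained setup described here (the callback and token ids below are made-up illustrations, not library defaults): generate() takes a prefix_allowed_tokens_fn callback that receives the batch index and the tokens produced so far and returns the token ids allowed at the next step.

    def allowed_tokens_fn(batch_id, input_ids):
        # toy constraint: only ever allow tokens from a fixed whitelist
        return [5, 17, 42]

    outputs = model.generate(input_ids, prefix_allowed_tokens_fn=allowed_tokens_fn, num_beams=2)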


@ -23,7 +23,7 @@ STOPPING_CRITERIA_INPUTS_DOCSTRING = r"""
Prediction scores of a language modeling head. These can be scores for each vocabulary token before SoftMax
or scores for each vocabulary token after SoftMax.
kwargs:
Additional stopping critera specific kwargs.
Additional stopping criteria specific kwargs.
Return:
:obj:`bool`. :obj:`False` indicates we should continue, :obj:`True` indicates we should stop.
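For illustration, a minimal custom criterion built on this interface (the class below is my own sketch, assuming StoppingCriteria is importable from the top-level package):

    from transformers import StoppingCriteria

    class StopOnTokenBudget(StoppingCriteria):
        """Stop once the running sequence reaches a fixed number of tokens."""

        def __init__(self, max_tokens: int):
            self.max_tokens = max_tokens

        def __call__(self, input_ids, scores, **kwargs) -> bool:
            # True means "stop", False means "keep going", matching the Return note above
            return input_ids.shape[-1] >= self.max_tokens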


@ -442,8 +442,8 @@ class TFGenerationMixin:
**kwargs
):
"""
Generate sequences for each example without beam search (num_beams == 1). All returned sequence are generated
independantly.
Generate sequences for each example without beam search (num_beams == 1). All returned sequences are generated
independently.
"""
# length of generated sentences / unfinished sentences


@ -821,7 +821,7 @@ class GenerationMixin:
... "at least two people were killed in a suspected bomb attack on a passenger bus "
... "in the strife-torn southern philippines on monday , the military said."
... )
>>> # encode input contex
>>> # encode input context
>>> input_ids = tokenizer(document, return_tensors="pt").input_ids
>>> # generate 3 independent sequences using beam search decoding (5 beams)
>>> # with T5 encoder-decoder model conditioned on short news article.
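In practice the decoding step sketched in these comments looks roughly like the following (argument values are illustrative, not taken from the original docstring):

    outputs = model.generate(input_ids, num_beams=5, num_return_sequences=3, max_length=50)
    print(tokenizer.batch_decode(outputs, skip_special_tokens=True))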


@ -94,7 +94,7 @@ class FlaxPreTrainedModel(PushToHubMixin):
self.key = PRNGKey(seed)
self.dtype = dtype
# randomely initialized parameters
# randomly initialized parameters
random_params = self.init_weights(self.key, input_shape)
# save required_params as set


@ -343,7 +343,7 @@ class CausalLMOutputWithPast(ModelOutput):
Language modeling loss (for next-token prediction).
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (:obj:`tuple(tupel(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors
of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
@ -423,7 +423,7 @@ class SequenceClassifierOutputWithPast(ModelOutput):
Classification (or regression if config.num_labels==1) loss.
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
past_key_values (:obj:`tuple(tupel(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors
of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
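As a hedged sketch of how the past_key_values cache described above is reused for incremental decoding (the model is a placeholder, assumed to be a causal LM returning one of these output types):

    outputs = model(input_ids, use_cache=True)
    past_key_values = outputs.past_key_values              # tuple(tuple(torch.FloatTensor)), one entry per layer
    next_token = outputs.logits[:, -1:].argmax(dim=-1)

    # on the next step only the new token is fed; the cached keys/values stand in for the whole prefix
    outputs = model(next_token, past_key_values=past_key_values, use_cache=True)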


@ -51,7 +51,7 @@ def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove="")
) # '_._' is replaced by a level separation (can be used to convert TF2.0 lists in PyTorch nn.ModulesList)
tf_name = re.sub(r"//+", "/", tf_name) # Remove empty levels at the end
tf_name = tf_name.split("/") # Convert from TF2.0 '/' separators to PyTorch '.' separators
# Some weights have a single name withtout "/" such as final_logits_bias in BART
# Some weights have a single name without "/" such as final_logits_bias in BART
if len(tf_name) > 1:
tf_name = tf_name[1:] # Remove level zero


@ -659,7 +659,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
Args:
inputs (:obj:`Dict[str, tf.Tensor]`):
The input of the saved model as a dictionnary of tensors.
The input of the saved model as a dictionary of tensors.
"""
output = self.call(inputs)
@ -944,7 +944,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
vectors from the end. If not provided or :obj:`None`, just returns None
Return:
:obj:`tf.Variable`: Pointer to the resized decoder or None if the output embeddings are differents of the
:obj:`tf.Variable`: Pointer to the resized decoder or None if the output embeddings are different from the
input ones.
"""
new_lm_head_decoder = old_lm_head_decoder


@ -291,7 +291,7 @@ class ModuleUtilsMixin:
The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard).
num_hidden_layers (:obj:`int`):
The number of hidden layers in the model.
is_attention_chunked: (:obj:`bool`, `optional, defaults to :obj:`False`):
is_attention_chunked: (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the attentions scores are computed by chunks or not.
Returns:


@ -716,7 +716,7 @@ AutoModelForPreTraining = auto_class_factory(
"AutoModelForPreTraining", MODEL_FOR_PRETRAINING_MAPPING, head_doc="pretraining"
)
# Private on puprose, the public class will add the deprecation warnings.
# Private on purpose, the public class will add the deprecation warnings.
_AutoModelWithLMHead = auto_class_factory(
"AutoModelWithLMHead", MODEL_WITH_LM_HEAD_MAPPING, head_doc="language modeling"
)


@ -103,7 +103,7 @@ FlaxAutoModelForMaskedLM = auto_class_factory(
)
FlaxAutoModelForSequenceClassification = auto_class_factory(
"AFlaxutoModelForSequenceClassification",
"FlaxAutoModelForSequenceClassification",
FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
head_doc="sequence classification",
)


@ -469,7 +469,7 @@ TFAutoModelForPreTraining = auto_class_factory(
"TFAutoModelForPreTraining", TF_MODEL_FOR_PRETRAINING_MAPPING, head_doc="pretraining"
)
# Private on puprose, the public class will add the deprecation warnings.
# Private on purpose, the public class will add the deprecation warnings.
_TFAutoModelWithLMHead = auto_class_factory(
"TFAutoModelWithLMHead", TF_MODEL_WITH_LM_HEAD_MAPPING, head_doc="language modeling"
)


@ -171,7 +171,7 @@ class BartConfig(PretrainedConfig):
self.gradient_checkpointing = gradient_checkpointing
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
# ensure backward compatibilty for BART CNN models
# ensure backward compatibility for BART CNN models
if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False):
self.forced_bos_token_id = self.bos_token_id
warnings.warn(


@ -111,7 +111,7 @@ class BartLearnedPositionalEmbedding(nn.Embedding):
def __init__(self, num_embeddings: int, embedding_dim: int):
# Bart is set up so that if padding_idx is specified then offset the embedding ids by 2
# and adjust num_embeddings appropriately. Other models dont have this hack
# and adjust num_embeddings appropriately. Other models don't have this hack
self.offset = 2
super().__init__(num_embeddings + self.offset, embedding_dim)
@ -236,9 +236,9 @@ class BartAttention(nn.Module):
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
if output_attentions:
# this operation is a bit akward, but it's required to
# this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to reshaped
# In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
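A standalone sketch of the reshape-twice pattern described in these comments (shapes are arbitrary; this is an illustration, not the model code). The 4-D view is what gets returned when output_attentions is requested, and re-deriving the 3-D tensor from that view keeps the returned attentions connected to the same autograd graph:

    import torch

    bsz, num_heads, tgt_len, src_len = 2, 4, 6, 6
    attn_weights = torch.softmax(torch.randn(bsz * num_heads, tgt_len, src_len), dim=-1)

    attn_weights_reshaped = attn_weights.view(bsz, num_heads, tgt_len, src_len)   # handed back to the caller
    attn_weights = attn_weights_reshaped.view(bsz * num_heads, tgt_len, src_len)  # reused for the value bmm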


@ -116,7 +116,7 @@ class TFBartLearnedPositionalEmbedding(TFSharedEmbeddings):
def __init__(self, num_embeddings: int, embedding_dim: int, **kwargs):
# Bart is set up so that if padding_idx is specified then offset the embedding ids by 2
# and adjust num_embeddings appropriately. Other models dont have this hack
# and adjust num_embeddings appropriately. Other models don't have this hack
self.offset = 2
super().__init__(num_embeddings + self.offset, embedding_dim, **kwargs)


@ -304,7 +304,7 @@ class MecabTokenizer:
class CharacterTokenizer:
"""Runs Character tokenziation."""
"""Runs Character tokenization."""
def __init__(self, vocab, unk_token, normalize_text=True):
"""


@ -451,7 +451,7 @@ Twitter-aware tokenizer, designed to be flexible and easy to adapt to new domain
the class Tokenizer.
4. When instantiating Tokenizer objects, there is a single option: preserve_case. By default, it is set to True. If it
is set to False, then the tokenizer will downcase everything except for emoticons.
is set to False, then the tokenizer will lowercase everything except for emoticons.
"""


@ -724,7 +724,7 @@ class BigBirdBlockSparseAttention(nn.Module):
band_product, dim=-1
) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, (5+n_rand_blocks)*to_block_size]
# contibution of sliding keys
# contribution of sliding keys
# [bsz, n_heads, m//from_block_size-4, from_block_size, 3*to_block_size] x [bsz, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1]
context_layer = self.torch_bmm_nd(
attn_weights[:, :, :, :, to_block_size : 4 * to_block_size], exp_blocked_value_matrix, ndim=5
@ -876,7 +876,7 @@ class BigBirdBlockSparseAttention(nn.Module):
attn_probs_view[:, :, q_idx, :, q_idx : q_idx + 3, :] = right_slice.view(
bsz, n_heads, from_block_size, 3, to_block_size
) # inner_band_product
# global keys (correspomding to 1st key block)
# global keys (corresponding to 1st key block)
attention_probs[:, :, 2 * from_block_size : -2 * from_block_size, :to_block_size] = attn_weights[
:, :, :, :, :to_block_size
].view(
@ -946,7 +946,7 @@ class BigBirdBlockSparseAttention(nn.Module):
@staticmethod
def torch_gather_b2(params, indices):
# this operation is equilvalent to tf.gather when batch_dims=2
# this operation is equivalent to tf.gather when batch_dims=2
if params.shape[:2] != indices.shape[:2]:
raise ValueError(
@ -1054,7 +1054,7 @@ class BigBirdBlockSparseAttention(nn.Module):
to_block_size: int. size of block in to sequence.
num_rand_blocks: int. Number of random chunks per row.
last_idx: if -1 then num_rand_blocks blocks chosen anywhere in to sequence,
if positive then num_rand_blocks blocks choosen only upto last_idx.
if positive then num_rand_blocks blocks chosen only up to last_idx.
Returns:
adjacency list of size from_seq_length//from_block_size-2 by num_rand_blocks
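The torch_gather_b2 helper quoted a few lines above is documented as equivalent to tf.gather with batch_dims=2; a minimal sketch of that behaviour using torch.gather (a standalone illustration, not the BigBird implementation) could look like:

    import torch

    def gather_batch_dims_2(params, indices):
        # params: (batch, heads, n, d), indices: (batch, heads, k)
        # returns out where out[b, h, k, :] == params[b, h, indices[b, h, k], :]
        idx = indices.unsqueeze(-1).expand(*indices.shape, params.size(-1))
        return torch.gather(params, dim=2, index=idx)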
@ -1149,7 +1149,7 @@ class BigBirdBlockSparseAttention(nn.Module):
plan_block_length = np.array(plan_from_length) // from_block_size
# till when to follow plan
max_plan_idx = plan_from_length.index(from_seq_length)
# Random Attention adjajency list
# Random Attention adjacency list
rand_attn = [
np.zeros((num_blocks, np.sum(plan_num_rand_blocks[: max_plan_idx + 1])), dtype=np.int32)
for i in range(num_heads)
@ -1246,8 +1246,8 @@ class BigBirdBlockSparseAttention(nn.Module):
Args:
block_id: int. block id of row.
to_start_block_id: int. random attention coloum start id.
to_end_block_id: int. random attention coloum end id.
to_start_block_id: int. random attention column start id.
to_end_block_id: int. random attention column end id.
num_rand_blocks: int. number of random blocks to be selected.
window_block_left: int. number of blocks of window to left of a block.
window_block_right: int. number of blocks of window to right of a block.
@ -1825,7 +1825,7 @@ BIG_BIRD_INPUTS_DOCSTRING = r"""
@dataclass
class BigBirdForPreTrainingOutput(ModelOutput):
"""
Output type of :class:`~transformers.BigBirdtForPreTraining`.
Output type of :class:`~transformers.BigBirdForPreTraining`.
Args:
loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`):
@ -2941,7 +2941,7 @@ class BigBirdForQuestionAnswering(BigBirdPreTrainedModel):
logits_mask = None
if question_lengths is not None:
# setting lengths logits to `-infi`
# setting lengths logits to `-inf`
logits_mask = self.prepare_question_mask(question_lengths, seqlen)
if token_type_ids is None:
token_type_ids = (~logits_mask).long()


@ -237,9 +237,9 @@ class BlenderbotAttention(nn.Module):
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
if output_attentions:
# this operation is a bit akward, but it's required to
# this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to reshaped
# In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)


@ -235,9 +235,9 @@ class BlenderbotSmallAttention(nn.Module):
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
if output_attentions:
# this operation is a bit akward, but it's required to
# this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to reshaped
# In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)


@ -695,7 +695,7 @@ CONVBERT_INPUTS_DOCSTRING = r"""
Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **maked**.
- 0 for tokens that are **masked**.
`What are attention masks? <../glossary.html#attention-mask>`__
token_type_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
@ -739,7 +739,7 @@ CONVBERT_INPUTS_DOCSTRING = r"""
@add_start_docstrings(
"The bare ConvBERT Model transformer outputing raw hidden-states without any specific head on top.",
"The bare ConvBERT Model transformer outputting raw hidden-states without any specific head on top.",
CONVBERT_START_DOCSTRING,
)
class TFConvBertModel(TFConvBertPreTrainedModel):


@ -683,7 +683,7 @@ class CTRLForSequenceClassification(CTRLPreTrainedModel):
sequence_lengths = -1
logger.warning(
f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
f"unexpected if using padding tokens in conjuction with `inputs_embeds.`"
f"unexpected if using padding tokens in conjunction with `inputs_embeds.`"
)
pooled_logits = logits[range(batch_size), sequence_lengths]


@ -511,7 +511,7 @@ def build_relative_position(query_size, key_size, bucket_size=-1, max_position=-
query_size (int): the length of query
key_size (int): the length of key
bucket_size (int): the size of position bucket
max_position (int): the maxium allowed absolute positoin
max_position (int): the maximum allowed absolute position
Return:
:obj:`torch.LongTensor`: A tensor with shape [1, query_size, key_size]
@ -698,7 +698,7 @@ class DisentangledSelfAttention(torch.nn.Module):
relative_pos = relative_pos.unsqueeze(1)
# bsz x height x query x key
elif relative_pos.dim() != 4:
raise ValueError(f"Relative postion ids must be of dim 2 or 3 or 4. {relative_pos.dim()}")
raise ValueError(f"Relative position ids must be of dim 2 or 3 or 4. {relative_pos.dim()}")
att_span = self.pos_ebd_size
relative_pos = relative_pos.long().to(query_layer.device)
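A minimal sketch of the simplest (non-bucketed) case of build_relative_position described above, ignoring bucket_size and max_position (my own illustration):

    import torch

    def build_relative_position_simple(query_size, key_size):
        q_ids = torch.arange(query_size)
        k_ids = torch.arange(key_size)
        rel_pos_ids = q_ids[:, None] - k_ids[None, :]   # rel_pos[q, k] = q - k
        return rel_pos_ids.unsqueeze(0)                 # shape [1, query_size, key_size]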


@ -428,7 +428,7 @@ class SPMTokenizer:
def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically contorl characters but we treat them
# \t, \n, and \r are technically control characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True


@ -134,7 +134,7 @@ def convert_fsmt_checkpoint_to_pytorch(fsmt_checkpoint_path, pytorch_dump_folder
f.write(json.dumps(src_vocab, ensure_ascii=False, indent=json_indent))
# detect whether this is a do_lower_case situation, which can be derived by checking whether we
# have at least one upcase letter in the source vocab
# have at least one uppercase letter in the source vocab
do_lower_case = True
for k in src_vocab.keys():
if not k.islower():


@ -252,7 +252,7 @@ FSMT_INPUTS_DOCSTRING = r"""
Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the heas is **masked**.
- 0 indicates the head is **masked**.
decoder_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in ``[0, 1]``:
@ -486,7 +486,7 @@ class FSMTEncoder(nn.Module):
Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the heas is **masked**.
- 0 indicates the head is **masked**.
Returns:
BaseModelOutput or Tuple comprised of:
@ -696,7 +696,7 @@ class FSMTDecoder(nn.Module):
Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the heas is **masked**.
- 0 indicates the head is **masked**.
cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(num_layers, num_heads)`, `optional`):
Mask to nullify selected heads of the cross-attention modules. Mask values selected in ``[0, 1]``:


@ -184,7 +184,7 @@ class FunnelAttentionStructure(nn.Module):
self.sin_dropout = nn.Dropout(config.hidden_dropout)
self.cos_dropout = nn.Dropout(config.hidden_dropout)
# Track where we are at in terms of pooling from the original input, e.g., by how much the sequence length was
# dividide.
# divided.
self.pooling_mult = None
def init_attention_inputs(self, inputs_embeds, attention_mask=None, token_type_ids=None):
@ -218,7 +218,7 @@ class FunnelAttentionStructure(nn.Module):
For the factorized attention, it returns the matrices (phi, pi, psi, omega) used in the paper, appendix A.2.2,
final formula.
For the relative shif attention, it returns all possible vectors R used in the paper, appendix A.2.1, final
For the relative shift attention, it returns all possible vectors R used in the paper, appendix A.2.1, final
formula.
Paper link: https://arxiv.org/abs/2006.03236


@ -169,7 +169,7 @@ class TFFunnelAttentionStructure:
For the factorized attention, it returns the matrices (phi, pi, psi, omega) used in the paper, appendix A.2.2,
final formula.
For the relative shif attention, it returns all possible vectors R used in the paper, appendix A.2.1, final
For the relative shift attention, it returns all possible vectors R used in the paper, appendix A.2.1, final
formula.
Paper link: https://arxiv.org/abs/2006.03236
@ -1009,7 +1009,7 @@ class TFFunnelForPreTrainingOutput(ModelOutput):
Args:
logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
Prediction scores of the head (scores for each token before SoftMax).
hidden_states (:obj:`tuple(tf.ensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
shape :obj:`(batch_size, sequence_length, hidden_size)`.


@ -70,7 +70,7 @@ class TFAttention(tf.keras.layers.Layer):
super().__init__(**kwargs)
n_state = nx # in Attention: n_state=768 (nx=n_embd)
# [switch nx => n_state from Block to Attention to keep identical to TF implem]
# [switch nx => n_state from Block to Attention to keep identical to TF implementation]
assert n_state % config.n_head == 0
self.n_ctx = n_ctx
self.n_head = config.n_head


@ -70,7 +70,7 @@ def bytes_to_unicode():
The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
decent coverage. This is a signficant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
tables between utf-8 bytes and unicode strings.
"""
bs = (
@ -189,7 +189,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
self.cache = {}
self.add_prefix_space = add_prefix_space
# Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
# Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
@property
@ -247,7 +247,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
for token in re.findall(self.pat, text):
token = "".join(
self.byte_encoder[b] for b in token.encode("utf-8")
) # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case)
) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
return bpe_tokens
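A small illustration of the byte-to-unicode lookup described above (standalone; bytes_to_unicode is the helper whose docstring is quoted in this hunk):

    byte_encoder = bytes_to_unicode()   # dict mapping every byte value 0-255 to a printable unicode character
    token = " café"
    mapped = "".join(byte_encoder[b] for b in token.encode("utf-8"))
    # every raw byte (the leading space and both bytes of "é") now has a printable stand-in,
    # so the BPE merges never have to deal with control characters or raw whitespace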


@ -221,7 +221,7 @@ class GPTNeoAttentionMixin:
if attention_mask is None:
attention_mask = torch.ones(batch_size, seq_length, dtype=torch.long, device=device)
# A block can also be padded becuase of the _look_back operation
# A block can also be padded because of the _look_back operation
# look back into the attention_block such that it will also get padded the same way
# and have 0s in the padded position
attention_mask = GPTNeoAttentionMixin._look_back(attention_mask, block_length, window_size, is_key_value=False)
@ -804,8 +804,8 @@ class GPTNeoModel(GPTNeoPreTrainedModel):
# Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head
# attention_probs has shape bsz x num_headss x N x N
# head_mask has shape n_layer x batch x num_headss x N x N
# attention_probs has shape bsz x num_heads x N x N
# head_mask has shape n_layer x batch x num_heads x N x N
head_mask = self.get_head_mask(head_mask, self.config.num_layers)
if inputs_embeds is None:


@ -35,11 +35,11 @@ class QuantEmbedding(nn.Module):
:obj:`torch.nn.Embedding`.
Args:
weight_bit (:obj:`int`, `optiona`l, defaults to :obj:`8`):
weight_bit (:obj:`int`, `optional`, defaults to :obj:`8`):
Bitwidth for the quantized weight.
momentum (:obj:`float`, `optional, defaults to :obj:`0.95`):
momentum (:obj:`float`, `optional`, defaults to :obj:`0.95`):
Momentum for updating the activation quantization range.
quant_mode (:obj:`bool`, `optional, defaults to :obj:`False`):
quant_mode (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the layer is quantized.
"""
@ -619,7 +619,7 @@ def symmetric_linear_quantization_params(num_bits, saturation_min, saturation_ma
`saturation_max`.
"""
# in this part, we do not need any gradient computation,
# in order to enfore this, we put torch.no_grad()
# in order to enforce this, we put torch.no_grad()
with torch.no_grad():
n = 2 ** (num_bits - 1) - 1
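A rough sketch of a symmetric quantization scale computed under these conventions (my own illustration, not the I-BERT implementation):

    import torch

    def symmetric_scale(num_bits, saturation_min, saturation_max):
        n = 2 ** (num_bits - 1) - 1                     # e.g. 127 for 8-bit signed values
        with torch.no_grad():                           # no gradients needed for the range statistics
            scale = torch.max(saturation_min.abs(), saturation_max.abs()) / n
        return scale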


@ -842,9 +842,9 @@ class LEDDecoderAttention(nn.Module):
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
if output_attentions:
# this operation is a bit akward, but it's required to
# this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to reshaped
# In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
@ -1499,7 +1499,7 @@ LED_INPUTS_DOCSTRING = r"""
Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the heas is **masked**.
- 0 indicates the head is **masked**.
decoder_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in ``[0, 1]``:
@ -1689,7 +1689,7 @@ class LEDEncoder(LEDPreTrainedModel):
Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the heas is **masked**.
- 0 indicates the head is **masked**.
inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
@ -1920,7 +1920,7 @@ class LEDDecoder(LEDPreTrainedModel):
Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the heas is **masked**.
- 0 indicates the head is **masked**.
cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
Mask to nullify selected heads of the cross-attention modules. Mask values selected in ``[0, 1]``:


@ -869,7 +869,7 @@ class TFLEDEncoderSelfAttention(tf.keras.layers.Layer):
# compute global attn probs
global_attn_probs_float = tf.nn.softmax(global_attn_scores, axis=-1)
# apply layer head maskin
# apply layer head masking
if layer_head_mask is not None:
if tf.executing_eagerly():
tf.debugging.assert_equal(
@ -1552,7 +1552,7 @@ LED_INPUTS_DOCSTRING = r"""
Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the heas is **masked**.
- 0 indicates the head is **masked**.
decoder_head_mask (:obj:`tf.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in ``[0, 1]``:
@ -1667,7 +1667,7 @@ class TFLEDEncoder(tf.keras.layers.Layer):
Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the heas is **masked**.
- 0 indicates the head is **masked**.
inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
@ -1926,14 +1926,14 @@ class TFLEDDecoder(tf.keras.layers.Layer):
Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the heas is **masked**.
- 0 indicates the head is **masked**.
encoder_head_mask (:obj:`tf.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`):
Mask to nullify selected heads of the attention modules in encoder to avoid performing cross-attention
on hidden heads. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the heas is **masked**.
- 0 indicates the head is **masked**.
past_key_values (:obj:`Tuple[Tuple[tf.Tensor]]` of length :obj:`config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up


@ -393,7 +393,7 @@ class LongformerTokenClassifierOutput(ModelOutput):
def _get_question_end_index(input_ids, sep_token_id):
"""
Computes the index of the first occurance of `sep_token_id`.
Computes the index of the first occurrence of `sep_token_id`.
"""
sep_token_indices = (input_ids == sep_token_id).nonzero()
@ -1428,7 +1428,7 @@ LONGFORMER_INPUTS_DOCSTRING = r"""
Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the heas is **masked**.
- 0 indicates the head is **masked**.
decoder_head_mask (:obj:`torch.Tensor` of shape :obj:`(num_layers, num_heads)`, `optional`):
Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in ``[0, 1]``:


@ -1388,7 +1388,7 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
# compute global attn probs
global_attn_probs_float = tf.nn.softmax(global_attn_scores, axis=-1)
# apply layer head maskin
# apply layer head masking
if layer_head_mask is not None:
if tf.executing_eagerly():
tf.debugging.assert_equal(
@ -1707,7 +1707,7 @@ class TFLongformerMainLayer(tf.keras.layers.Layer):
inputs["attention_mask"], (attention_mask_shape[0], attention_mask_shape[1], 1, 1)
)
# Since attention_mask is 1.0 for positions we want to locall attend locally and 0.0 for
# Since attention_mask is 1.0 for positions we want to attend locally and 0.0 for
# masked and global attn positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
# Since we are adding it to the raw scores before the softmax, this is
@ -1920,7 +1920,7 @@ LONGFORMER_INPUTS_DOCSTRING = r"""
Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the heas is **masked**.
- 0 indicates the head is **masked**.
global_attention_mask (:obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
Mask to decide the attention given on each token, local attention or global attention. Tokens with global


@ -95,9 +95,9 @@ class LxmertConfig(PretrainedConfig):
Whether or not to add masked language modeling (as used in pretraining models such as BERT) to the loss
objective.
task_obj_predict (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to add object prediction, attribute ppredictionand feature regression to the loss objective.
Whether or not to add object prediction, attribute prediction and feature regression to the loss objective.
task_qa (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to add the question-asansweringoss to the objective
Whether or not to add the question-answering loss to the objective
visual_obj_loss (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to calculate the object-prediction loss objective
visual_attr_loss (:obj:`bool`, `optional`, defaults to :obj:`True`):


@ -306,9 +306,9 @@ class M2M100Attention(nn.Module):
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
if output_attentions:
# this operation is a bit akward, but it's required to
# this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to reshaped
# In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)


@ -252,9 +252,9 @@ class MarianAttention(nn.Module):
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
if output_attentions:
# this operation is a bit akward, but it's required to
# this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to reshaped
# In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)


@ -117,7 +117,7 @@ class MBartLearnedPositionalEmbedding(nn.Embedding):
def __init__(self, num_embeddings: int, embedding_dim: int):
# MBart is set up so that if padding_idx is specified then offset the embedding ids by 2
# and adjust num_embeddings appropriately. Other models dont have this hack
# and adjust num_embeddings appropriately. Other models don't have this hack
self.offset = 2
super().__init__(num_embeddings + self.offset, embedding_dim)
@ -243,9 +243,9 @@ class MBartAttention(nn.Module):
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
if output_attentions:
# this operation is a bit akward, but it's required to
# this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to reshaped
# In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)


@ -118,7 +118,7 @@ class TFMBartLearnedPositionalEmbedding(TFSharedEmbeddings):
def __init__(self, num_embeddings: int, embedding_dim: int, **kwargs):
# MBart is set up so that if padding_idx is specified then offset the embedding ids by 2
# and adjust num_embeddings appropriately. Other models dont have this hack
# and adjust num_embeddings appropriately. Other models don't have this hack
self.offset = 2
super().__init__(num_embeddings + self.offset, embedding_dim, **kwargs)
@ -690,7 +690,7 @@ class TFMBartEncoder(tf.keras.layers.Layer):
Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the heas is **masked**.
- 0 indicates the head is **masked**.
inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded


@ -1487,7 +1487,7 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel):
@add_start_docstrings(
"""
MoibleBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
MobileBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
for Named-Entity-Recognition (NER) tasks.
""",
MOBILEBERT_START_DOCSTRING,


@ -674,7 +674,7 @@ MPNET_INPUTS_DOCSTRING = r"""
@add_start_docstrings(
"The bare MPNet Model transformer outputing raw hidden-states without any specific head on top.",
"The bare MPNet Model transformer outputting raw hidden-states without any specific head on top.",
MPNET_START_DOCSTRING,
)
class TFMPNetModel(TFMPNetPreTrainedModel):


@ -154,7 +154,7 @@ class MPNetTokenizerFast(PreTrainedTokenizerFast):
:obj:`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while
not having been set.
MPNet tokenizer has a special mask token to be usble in the fill-mask pipeline. The mask token will greedily
MPNet tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
comprise the space before the `<mask>`.
"""
if self._mask_token is None and self.verbose:


@ -146,7 +146,7 @@ class Attention(nn.Module):
def __init__(self, nx, n_ctx, config, scale=False):
super().__init__()
n_state = nx # in Attention: n_state=768 (nx=n_embd)
# [switch nx => n_state from Block to Attention to keep identical to TF implem]
# [switch nx => n_state from Block to Attention to keep identical to TF implementation]
assert n_state % config.n_head == 0
self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
self.n_head = config.n_head
@ -178,7 +178,7 @@ class Attention(nn.Module):
w = torch.matmul(q, k)
if self.scale:
w = w / math.sqrt(v.size(-1))
# w = w * self.bias + -1e9 * (1 - self.bias) # TF implem method: mask_attn_weights
# w = w * self.bias + -1e9 * (1 - self.bias) # TF implementation method: mask_attn_weights
# XD: self.b may be larger than w, so we need to crop it
b = self.bias[:, :, : w.size(-2), : w.size(-1)]
w = w * b + -1e4 * (1 - b)
@ -202,11 +202,11 @@ class Attention(nn.Module):
def merge_heads(self, x):
x = x.permute(0, 2, 1, 3).contiguous()
new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
return x.view(*new_x_shape) # in Tensorflow implem: fct merge_states
return x.view(*new_x_shape) # in Tensorflow implementation: fct merge_states
def split_heads(self, x, k=False):
new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
x = x.view(*new_x_shape) # in Tensorflow implem: fct split_states
x = x.view(*new_x_shape) # in Tensorflow implementation: fct split_states
if k:
return x.permute(0, 2, 3, 1)
else:
@ -467,7 +467,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
raise ValueError("You have to specify either input_ids or inputs_embeds")
if position_ids is None:
# Code is different from when we had a single embedding matrice from position and token embeddings
# Code is different from when we had a single embedding matrix from position and token embeddings
position_ids = self.position_ids[None, : input_shape[-1]]
# Attention mask.
@ -814,7 +814,7 @@ class OpenAIGPTForSequenceClassification(OpenAIGPTPreTrainedModel):
sequence_lengths = -1
logger.warning(
f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
f"unexpected if using padding tokens in conjuction with `inputs_embeds.`"
f"unexpected if using padding tokens in conjunction with `inputs_embeds.`"
)
pooled_logits = logits[range(batch_size), sequence_lengths]


@ -62,7 +62,7 @@ class TFAttention(tf.keras.layers.Layer):
super().__init__(**kwargs)
n_state = nx # in Attention: n_state=768 (nx=n_embd)
# [switch nx => n_state from Block to Attention to keep identical to TF implem]
# [switch nx => n_state from Block to Attention to keep identical to TF implementation]
assert (
n_state % config.n_head == 0
), f"Hidden dimension {n_state} not dividable by number of heads {config.n_head}"


@ -252,9 +252,9 @@ class PegasusAttention(nn.Module):
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
if output_attentions:
# this operation is a bit akward, but it's required to
# this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to reshaped
# In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)


@ -719,7 +719,7 @@ class TFPegasusEncoder(tf.keras.layers.Layer):
Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the heas is **masked**.
- 0 indicates the head is **masked**.
inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded


@ -723,9 +723,9 @@ class ProphetNetAttention(nn.Module):
attn_weights = attn_weights + attention_mask
if output_attentions:
# this operation is a bit akward, but it's required to
# this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to reshaped
# In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(batch_size, self.num_attn_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(batch_size * self.num_attn_heads, tgt_len, src_len)
@ -1243,7 +1243,7 @@ class ProphetNetEncoder(ProphetNetPreTrainedModel):
r"""
word_embeddings (:obj:`torch.nn.Embeddings` of shape :obj:`(config.vocab_size, config.hidden_size)`, `optional`):
The word embedding parameters. This can be used to initialize :class:`~transformers.ProphetNetEncoder` with
pre-defined word embeddings instead of randomely initialized word embeddings.
pre-defined word embeddings instead of randomly initialized word embeddings.
"""
def __init__(self, config: ProphetNetConfig, word_embeddings: nn.Embedding = None):
@ -1380,7 +1380,7 @@ class ProphetNetDecoder(ProphetNetPreTrainedModel):
r"""
word_embeddings (:obj:`torch.nn.Embeddings` of shape :obj:`(config.vocab_size, config.hidden_size)`, `optional`):
The word embedding parameters. This can be used to initialize :class:`~transformers.ProphetNetEncoder` with
pre-defined word embeddings instead of randomely initialized word embeddings.
pre-defined word embeddings instead of randomly initialized word embeddings.
"""
def __init__(self, config: ProphetNetConfig, word_embeddings: nn.Embedding = None):


@ -285,7 +285,7 @@ class TFRagPreTrainedModel(TFPreTrainedModel):
>>> # load retriever
>>> retriever = RagRetriever.from_pretrained(PATH, index_name="exact", use_dummy_dataset=True)
>>> # load fine-tuned model with retriver
>>> # load fine-tuned model with retriever
>>> model = TFRagModel.from_pretrained("./rag", retriever=retriever)
"""


@ -234,7 +234,7 @@ class CanonicalHFIndex(HFIndexBase):
Args:
vector_size (:obj:`int`): the dimension of the passages embeddings used by the index
dataset_name (:obj:`str`, optional, defaults to ``wiki_dpr``):
A datatset identifier of the indexed dataset on HuggingFace AWS bucket (list all available datasets and ids
A dataset identifier of the indexed dataset on HuggingFace AWS bucket (list all available datasets and ids
with ``datasets.list_datasets()``).
dataset_split (:obj:`str`, optional, defaults to ``train``)
Which split of the ``dataset`` to load.
@ -442,7 +442,7 @@ class RagRetriever:
def init_retrieval(self):
"""
Retriever initalization function. It loads the index into memory.
Retriever initialization function. It loads the index into memory.
"""
logger.info("initializing retrieval")


@ -612,7 +612,7 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
if isinstance(self.num_buckets, int):
assert (
self.num_buckets % 2 == 0
), f"There should be an even number of bucktes, but `self.num_bucktes`: {self.num_buckets}"
), f"There should be an even number of buckets, but `self.num_buckets`: {self.num_buckets}"
rotation_size = self.num_buckets
num_buckets = self.num_buckets
else:


@ -179,7 +179,7 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
:obj:`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while
not having been set.
Roberta tokenizer has a special mask token to be usble in the fill-mask pipeline. The mask token will greedily
Roberta tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
comprise the space before the `<mask>`.
"""
if self._mask_token is None and self.verbose:


@ -319,9 +319,9 @@ class Speech2TextAttention(nn.Module):
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
if output_attentions:
# this operation is a bit akward, but it's required to
# this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to reshaped
# In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
@ -559,7 +559,7 @@ class Speech2TextPreTrainedModel(PreTrainedModel):
return input_lengths
def _get_subsampled_encoder_attn_mask(self, attention_mask):
# generate creates 3D attention mask, becuase of the shape of input_features
# generate creates 3D attention mask, because of the shape of input_features
# convert it to 2D if thats the case
if len(attention_mask.shape) > 2:
attention_mask = attention_mask[:, :, -1]


@ -1172,7 +1172,7 @@ T5_ENCODER_INPUTS_DOCSTRING = r"""
Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
"""
# Warning messafe for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
# Warning message for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
__HEAD_MASK_WARNING_MSG = """
The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently,
`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions.


@ -637,7 +637,7 @@ class TFT5MainLayer(tf.keras.layers.Layer):
raise ValueError(f"You have to specify either {err_msg_prefix}inputs or {err_msg_prefix}inputs_embeds")
if inputs["inputs_embeds"] is None:
assert self.embed_tokens is not None, "You have to intialize the model with valid token embeddings"
assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings"
inputs["inputs_embeds"] = self.embed_tokens(inputs["input_ids"])
batch_size, seq_length = input_shape


@ -2021,7 +2021,7 @@ def _calculate_aggregate_mask(answer, pooled_output, cell_selection_preference,
apply to numbers. If the answer is a number but does not appear in the table then we must use some aggregation
case. The ambiguous case is when the answer is a number that also appears in the table. In this case we use the
aggregation function probabilities predicted by the model to decide whether to select or aggregate. The threshold
for this is a hyperparameter `cell_selection_preference
for this is a hyperparameter `cell_selection_preference`
Args:
answer (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, )`):
@ -2041,7 +2041,7 @@ def _calculate_aggregate_mask(answer, pooled_output, cell_selection_preference,
aggregate_mask_init = torch.logical_not(torch.isnan(answer)).type(torch.FloatTensor).to(answer.device)
logits_aggregation = aggregation_classifier(pooled_output)
dist_aggregation = torch.distributions.categorical.Categorical(logits=logits_aggregation)
# Index 0 correponds to "no aggregation".
# Index 0 corresponds to "no aggregation".
aggregation_ops_total_mass = torch.sum(dist_aggregation.probs[:, 1:], dim=1)
# Cell selection examples according to current model.
@ -2126,7 +2126,7 @@ def _calculate_aggregation_loss_unknown(logits_aggregation, aggregate_mask):
answer supervision) per example.
"""
dist_aggregation = torch.distributions.categorical.Categorical(logits=logits_aggregation)
# Index 0 correponds to "no aggregation".
# Index 0 corresponds to "no aggregation".
aggregation_ops_total_mass = torch.sum(dist_aggregation.probs[:, 1:], dim=1)
# Predict some aggregation in case of an answer that needs aggregation.
# This increases the probability of all aggregation functions, in a way


@ -2357,7 +2357,7 @@ def _get_numeric_value_from_date(date, mask):
def _get_span_length_key(span):
"""Sorts span by decreasing length first and incresing first index second."""
"""Sorts span by decreasing length first and increasing first index second."""
return span[1] - span[0], -span[0]
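A worked example of the sort key above, assuming it is consumed with reverse=True so that longer spans come first, as the docstring describes:

    def _get_span_length_key(span):
        return span[1] - span[0], -span[0]

    spans = [(0, 2), (3, 7), (1, 5)]
    print(sorted(spans, key=_get_span_length_key, reverse=True))
    # [(1, 5), (3, 7), (0, 2)] -- the two length-4 spans first, ordered by increasing start index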


@ -292,7 +292,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
elif "<unk>" in self.sym2idx:
self.unk_idx = self.sym2idx["<unk>"]
else:
raise ValueError("No <unkown> token in vocabulary")
raise ValueError("No <unknown> token in vocabulary")
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if os.path.isdir(save_directory):


@ -382,9 +382,9 @@ class Wav2Vec2Attention(nn.Module):
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
if output_attentions:
# this operation is a bit akward, but it's required to
# this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to reshaped
# In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)


@ -111,7 +111,7 @@ class Wav2Vec2Processor:
When used in normal mode, this method forwards all its arguments to Wav2Vec2FeatureExtractor's
:meth:`~transformers.Wav2Vec2FeatureExtractor.__call__` and returns its output. If used in the context
:meth:`~transformers.Wav2Vec2Processor.as_target_processor` this method forwards all its arguments to
Wav2Vec2CTCTokenizer's :meth:`~transformers.Wav2Vec2CTCTokenizer.__call__`. Please refer to the doctsring of
Wav2Vec2CTCTokenizer's :meth:`~transformers.Wav2Vec2CTCTokenizer.__call__`. Please refer to the docstring of
the above two methods for more information.
"""
return self.current_processor(*args, **kwargs)
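A hedged usage sketch of the two modes described above (the audio array, sampling rate and transcription are placeholders):

    inputs = processor(speech_array, sampling_rate=16_000, return_tensors="pt")
    with processor.as_target_processor():
        labels = processor(transcription, return_tensors="pt").input_ids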


@ -348,7 +348,7 @@ XLM_INPUTS_DOCSTRING = r"""
A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
languages ids which can be obtained from the language names by using two conversion mappings provided in
the configuration of the model (only provided for multilingual models). More precisely, the `language name
to language id` mapping is in :obj:`model.config.lang2id` (which is a dictionary strring to int) and the
to language id` mapping is in :obj:`model.config.lang2id` (which is a dictionary string to int) and the
`language id to language name` mapping is in :obj:`model.config.id2lang` (dictionary int to string).
See usage examples detailed in the :doc:`multilingual documentation <../multilingual>`.
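A short sketch of building the language-id tensor from these mappings (model and inputs are placeholders):

    import torch

    lang_id = model.config.lang2id["en"]          # string -> int, as described above
    langs = torch.full_like(input_ids, lang_id)   # same shape as input_ids
    outputs = model(input_ids, langs=langs)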
@ -1188,7 +1188,7 @@ class XLMForMultipleChoice(XLMPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_model_forward(XLM_INPUTS_DOCSTRING.format("batch_size, num_choicec, sequence_length"))
@add_start_docstrings_to_model_forward(XLM_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,


@ -749,8 +749,8 @@ class XLMTokenizer(PreTrainedTokenizer):
def _tokenize(self, text, lang="en", bypass_tokenizer=False):
"""
Tokenize a string given language code. For Chinese, Japanese and Thai, we use a language specific
tokenizerself. Otherwise, we use Moses.
Tokenize a string given language code. For Chinese, Japanese and Thai, we use a language specific tokenizer.
Otherwise, we use Moses.
Details of tokenization:


@ -1113,7 +1113,7 @@ XLNET_INPUTS_DOCSTRING = r"""
Mask values selected in ``[0, 1]``:
- 1 for tokens that are **masked**,
- 0 for tokens that are **not maked**.
- 0 for tokens that are **not masked**.
You can only uses one of :obj:`input_mask` and :obj:`attention_mask`.
head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):


@ -182,7 +182,7 @@ class AdamWeightDecay(tf.keras.optimizers.Adam):
name (:obj:`str`, `optional`, defaults to 'AdamWeightDecay'):
Optional name for the operations created when applying gradients.
kwargs:
Keyward arguments. Allowed to be {``clipnorm``, ``clipvalue``, ``lr``, ``decay``}. ``clipnorm`` is clip
Keyword arguments. Allowed to be {``clipnorm``, ``clipvalue``, ``lr``, ``decay``}. ``clipnorm`` is clip
gradients by norm; ``clipvalue`` is clip gradients by value, ``decay`` is included for backward
compatibility to allow time inverse decay of learning rate. ``lr`` is included for backward compatibility,
recommended to use ``learning_rate`` instead.


@ -128,7 +128,7 @@ class Conversation:
"""
Iterates over all blobs of the conversation.
Retuns: Iterator of (is_user, text_chunk) in chronological order of the conversation. ``is_user`` is a
Returns: Iterator of (is_user, text_chunk) in chronological order of the conversation. ``is_user`` is a
:obj:`bool`, ``text_chunks`` is a :obj:`str`.
"""
for user_input, generated_response in zip(self.past_user_inputs, self.generated_responses):


@ -48,7 +48,7 @@ class Text2TextGenerationPipeline(Pipeline):
def check_inputs(self, input_length: int, min_length: int, max_length: int):
"""
Checks wether there might be something wrong with given input with regard to the model.
Checks whether there might be something wrong with given input with regard to the model.
"""
return True
@ -204,7 +204,7 @@ class SummarizationPipeline(Text2TextGenerationPipeline):
def check_inputs(self, input_length: int, min_length: int, max_length: int) -> bool:
"""
Checks wether there might be something wrong with given input with regard to the model.
Checks whether there might be something wrong with given input with regard to the model.
"""
if input_length < min_length // 2:
logger.warning(


@ -520,7 +520,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
filename_prefix: Optional[str] = None,
) -> Tuple[str]:
"""
Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens as well asin a unique JSON
Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens as well as in a unique JSON
file containing {config + vocab + added-tokens}.
"""
save_directory = str(save_directory)


@ -208,7 +208,7 @@ class Trainer:
Note that if it's a :obj:`torch.utils.data.dataset.IterableDataset` with some randomization and you are
training in a distributed fashion, your iterable dataset should either use a internal attribute
:obj:`generator` that is a :obj:`torch.Generator` for the randomization that must be identic on all
:obj:`generator` that is a :obj:`torch.Generator` for the randomization that must be identical on all
processes (and the Trainer will manually set the seed of this :obj:`generator` at each epoch) or have a
:obj:`set_epoch()` method that internally sets the seed of the RNGs used.
eval_dataset (:obj:`torch.utils.data.dataset.Dataset`, `optional`):


@ -42,7 +42,7 @@ class TrainerState:
In all this class, one step is to be understood as one update step. When using gradient accumulation, one
update step may require several forward and backward passes: if you use :obj:`gradient_accumulation_steps=n`,
then one update step requires going throuch `n` batches.
then one update step requires going through `n` batches.
Args:
epoch (:obj:`float`, `optional`):


@ -243,7 +243,7 @@ class SequentialDistributedSampler(Sampler):
def __init__(self, dataset, num_replicas=None, rank=None, batch_size=None):
warnings.warn(
"SequentialDistributedSampler is deprecated and will be removed in v5 of Tranformers.",
"SequentialDistributedSampler is deprecated and will be removed in v5 of Transformers.",
FutureWarning,
)
if num_replicas is None:
@ -363,7 +363,7 @@ class DistributedTensorGatherer:
def __init__(self, world_size, num_samples, make_multiple_of=None, padding_index=-100):
warnings.warn(
"DistributedTensorGatherer is deprecated and will be removed in v5 of Tranformers.",
"DistributedTensorGatherer is deprecated and will be removed in v5 of Transformers.",
FutureWarning,
)
self.world_size = world_size


@ -199,7 +199,7 @@ class Seq2SeqTrainer(Trainer):
def _pad_tensors_to_max_len(self, tensor, max_length):
if self.tokenizer is None:
raise ValueError(
f"Tensor need to be padded to `max_length={max_length}` but no tokenzier was passed when creating "
f"Tensor need to be padded to `max_length={max_length}` but no tokenizer was passed when creating "
"this `Trainer`. Make sure to create your `Trainer` with the appropriate tokenizer."
)
# If PAD token is not defined at least EOS token has to be defined


@ -140,7 +140,7 @@ def get_verbosity() -> int:
def set_verbosity(verbosity: int) -> None:
"""
Set the vebosity level for the 🤗 Transformers's root logger.
Set the verbosity level for the 🤗 Transformers's root logger.
Args:
verbosity (:obj:`int`):