Testing that batch_encode_plus is the same as encode_plus (#2973)

* Testing that encode_plus and batch_encode_plus behave the same way

Spoiler alert: they don't

* Testing rest of arguments in batch_encode_plus

* Test tensor return in batch_encode_plus

* Addressing Sam's comments

* flake8

* Simplified with `num_added_tokens`
Lysandre Debut 2020-02-24 12:09:46 -05:00 committed by GitHub
parent 17c45c39ed
commit 21d8b6a33e
3 changed files with 222 additions and 39 deletions
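
The tests added in this commit check, roughly, that splitting a `batch_encode_plus` output back into per-example dictionaries reproduces the corresponding `encode_plus` outputs. A minimal standalone sketch of that check (the checkpoint name is only an example; any pretrained slow tokenizer would do):

    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    sequences = ["A short sequence", "A slightly longer example sequence"]

    # encode_plus: one dict per sequence
    per_sequence = [tokenizer.encode_plus(sequence) for sequence in sequences]

    # batch_encode_plus: one dict of lists covering the whole batch
    batched = tokenizer.batch_encode_plus(sequences)

    # Re-split the batched dict into per-example dicts and compare
    resplit = [
        {key: batched[key][i] for key in batched}
        for i in range(len(batched["input_ids"]))
    ]
    assert resplit == per_sequence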


@@ -98,6 +98,12 @@ class T5Tokenizer(PreTrainedTokenizer):
             additional_special_tokens=additional_special_tokens,
             **kwargs,
         )
+        self.max_len_single_sentence = (
+            self.max_len
+        )  # no default special tokens - you can update this value if you add special tokens
+        self.max_len_sentences_pair = (
+            self.max_len
+        )  # no default special tokens - you can update this value if you add special tokens
 
         try:
             import sentencepiece as spm
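
The T5 change above sets `max_len_single_sentence` and `max_len_sentences_pair` to the full `max_len`, since this tokenizer adds no special tokens by default; a subclass that does add them would shrink these budgets accordingly. A hypothetical sketch (the subclass and the token counts are illustrative only):

    # Hypothetical subclass; "1" and "2" stand for however many special
    # tokens such a tokenizer would actually add per sequence or per pair.
    class T5TokenizerWithSpecialTokens(T5Tokenizer):
        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.max_len_single_sentence = self.max_len - 1
            self.max_len_sentences_pair = self.max_len - 2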


@@ -14,7 +14,6 @@
 # limitations under the License.
 """Tokenization classes for OpenAI GPT."""
 
 import copy
 import itertools
 import json
@@ -153,6 +152,18 @@ class PreTrainedTokenizer(object):
     padding_side = "right"
 
+    NO_PAD_TOKEN_FOR_BATCH_MSG = (
+        "No padding token is set for this model, therefore no batch can be made with uneven "
+        "sequences. Set a padding token or adjust the lengths of the sequences building the "
+        "batch so that every sequence is of the same length."
+    )
+
+    UNEVEN_SEQUENCES_FOR_BATCH_MSG = (
+        "The sequences building the batch are not of the same size, no tensor "
+        "can be built. Set `pad_to_max_length=True` to pad the smaller sequences "
+        "up to the larger sequence's length."
+    )
+
     @property
     def bos_token(self):
         """ Beginning of sentence token (string). Log an error if used while not having been set. """
@@ -1020,14 +1031,18 @@ class PreTrainedTokenizer(object):
     def batch_encode_plus(
         self,
         batch_text_or_text_pairs=None,
-        add_special_tokens=False,
+        add_special_tokens=True,
         max_length=None,
         stride=0,
         truncation_strategy="longest_first",
+        pad_to_max_length=False,
         return_tensors=None,
-        return_input_lengths=False,
-        return_attention_masks=False,
+        return_token_type_ids=True,
+        return_attention_masks=True,
+        return_overflowing_tokens=False,
+        return_special_tokens_masks=False,
         return_offsets_mapping=False,
+        return_input_lengths=False,
         **kwargs
     ):
         """
@@ -1050,14 +1065,54 @@ class PreTrainedTokenizer(object):
                 - 'only_first': Only truncate the first sequence
                 - 'only_second': Only truncate the second sequence
                 - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
+            pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and
+                padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length.
+                The tokenizer padding sides are handled by the class attribute `padding_side` which can be set to the following strings:
+                - 'left': pads on the left of the sequences
+                - 'right': pads on the right of the sequences
+                Defaults to False: no padding.
             return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
                 or PyTorch torch.Tensor instead of a list of python integers.
             return_input_lengths: (optional) If set the resulting dictionary will include the length of each sample
             return_attention_masks: (optional) Set to True to return the attention mask (default False)
             return_offsets_mapping: (optional) Not available, should be set to False or it will throw NotImplementError
             **kwargs: passed to the `self.tokenize()` method
+
+        Return:
+            A Dictionary of shape::
+
+                {
+                    input_ids: list[List[int]],
+                    token_type_ids: list[List[int]] if return_token_type_ids is True (default)
+                    attention_mask: list[List[int]] if return_attention_mask is True (default)
+                    overflowing_tokens: list[List[int]] if a ``max_length`` is specified and return_overflowing_tokens is True
+                    num_truncated_tokens: List[int] if a ``max_length`` is specified and return_overflowing_tokens is True
+                    special_tokens_mask: list[List[int]] if ``add_special_tokens`` is set to ``True`` and return_special_tokens_mask is True
+                }
+
+            With the fields:
+                ``input_ids``: list of token ids to be fed to a model
+                ``token_type_ids``: list of token type ids to be fed to a model
+                ``attention_mask``: list of indices specifying which tokens should be attended to by the model
+                ``overflowing_tokens``: list of overflowing tokens if a max length is specified.
+                ``num_truncated_tokens``: number of overflowing tokens if a ``max_length`` is specified
+                ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
+                    tokens and 1 specifying sequence tokens.
         """
+
+        def get_input_ids(text):
+            if isinstance(text, str):
+                tokens = self.tokenize(text, add_special_tokens=add_special_tokens, **kwargs)
+                return self.convert_tokens_to_ids(tokens)
+            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
+                return self.convert_tokens_to_ids(text)
+            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
+                return text
+            else:
+                raise ValueError(
+                    "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
+                )
+
         if return_offsets_mapping:
             raise NotImplementedError(
                 "return_offset_mapping is not available when using Python tokenizers."
@@ -1067,21 +1122,47 @@ class PreTrainedTokenizer(object):
                 "https://github.com/huggingface/transformers/pull/2674"
             )
 
-        batch_outputs = {}
+        input_ids = []
         for ids_or_pair_ids in batch_text_or_text_pairs:
             if isinstance(ids_or_pair_ids, (list, tuple)):
                 assert len(ids_or_pair_ids) == 2
                 ids, pair_ids = ids_or_pair_ids
             else:
                 ids, pair_ids = ids_or_pair_ids, None
-            outputs = self.encode_plus(
-                ids,
-                pair_ids,
-                add_special_tokens=add_special_tokens,
+
+            first_ids = get_input_ids(ids)
+            second_ids = get_input_ids(pair_ids) if pair_ids is not None else None
+            input_ids.append((first_ids, second_ids))
+
+        if max_length is None and pad_to_max_length:
+
+            def total_sequence_length(input_pairs):
+                first_ids, second_ids = input_pairs
+                return len(first_ids) + (
+                    self.num_added_tokens()
+                    if second_ids is None
+                    else (len(second_ids) + self.num_added_tokens(pair=True))
+                )
+
+            max_length = max([total_sequence_length(ids) for ids in input_ids])
+
+        batch_outputs = {}
+        for first_ids, second_ids in input_ids:
+            # Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by
+            # the model. It adds special tokens, truncates sequences if overflowing while taking into account
+            # the special tokens and manages a window stride for overflowing tokens
+            outputs = self.prepare_for_model(
+                first_ids,
+                pair_ids=second_ids,
                 max_length=max_length,
+                pad_to_max_length=pad_to_max_length,
+                add_special_tokens=add_special_tokens,
                 stride=stride,
                 truncation_strategy=truncation_strategy,
-                return_tensors=None,
+                return_attention_mask=return_attention_masks,
+                return_token_type_ids=return_token_type_ids,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_masks,
             )
 
             # Append the non-padded length to the output
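
When `pad_to_max_length=True` is passed without an explicit `max_length`, the target length above is derived from the longest example in the batch plus the special tokens the model will add, which is the `num_added_tokens` simplification mentioned in the commit message. The same computation as a standalone sketch, assuming a slow tokenizer exposing the `num_added_tokens` API shown in the diff:

    def infer_batch_max_length(tokenizer, input_id_pairs):
        # input_id_pairs: list of (first_ids, second_ids_or_None) tuples of already-converted ids
        lengths = []
        for first_ids, second_ids in input_id_pairs:
            if second_ids is None:
                lengths.append(len(first_ids) + tokenizer.num_added_tokens())
            else:
                lengths.append(len(first_ids) + len(second_ids) + tokenizer.num_added_tokens(pair=True))
        return max(lengths)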
@@ -1093,31 +1174,28 @@ class PreTrainedTokenizer(object):
                     batch_outputs[key] = []
                 batch_outputs[key].append(value)
 
-        # Compute longest sequence size
-        max_seq_len = max(map(len, batch_outputs["input_ids"]))
-
-        if return_attention_masks:
-            # Allow the model to not give any special attention to padded input
-            batch_outputs["attention_mask"] = [[0] * len(v) for v in batch_outputs["input_ids"]]
-
         if return_tensors is not None:
+
             # Do the tensor conversion in batch
             for key, value in batch_outputs.items():
-                padded_value = value
-                # verify that the tokenizer has a pad_token_id
-                if key != "input_len" and self._pad_token is not None:
-                    # Padding handle
-                    padded_value = [
-                        v + [self.pad_token_id if key == "input_ids" else 1] * (max_seq_len - len(v))
-                        for v in padded_value
-                    ]
 
                 if return_tensors == "tf" and is_tf_available():
-                    batch_outputs[key] = tf.constant(padded_value)
+                    try:
+                        batch_outputs[key] = tf.constant(value)
+                    except ValueError:
+                        if None in [item for sequence in value for item in sequence]:
+                            raise ValueError(self.NO_PAD_TOKEN_FOR_BATCH_MSG)
+                        else:
+                            raise ValueError(self.UNEVEN_SEQUENCES_FOR_BATCH_MSG)
                 elif return_tensors == "pt" and is_torch_available():
-                    batch_outputs[key] = torch.tensor(padded_value)
+                    try:
+                        batch_outputs[key] = torch.tensor(value)
+                    except ValueError:
+                        raise ValueError(self.UNEVEN_SEQUENCES_FOR_BATCH_MSG)
+                    except RuntimeError:
+                        if None in [item for sequence in value for item in sequence]:
+                            raise ValueError(self.NO_PAD_TOKEN_FOR_BATCH_MSG)
+                        else:
+                            raise
                 elif return_tensors is not None:
                     logger.warning(
                         "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(
@@ -1125,13 +1203,6 @@ class PreTrainedTokenizer(object):
                     )
                 )
 
-        # encoder_attention_mask requires 1 for real token, 0 for padding, just invert value
-        if return_attention_masks:
-            if is_tf_available():
-                batch_outputs["attention_mask"] = tf.abs(batch_outputs["attention_mask"] - 1)
-            else:
-                batch_outputs["attention_mask"] = torch.abs(batch_outputs["attention_mask"] - 1)
-
         return batch_outputs
 
     def prepare_for_model(
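
From the caller's side, the try/except blocks above map tensor-construction failures onto the two messages defined earlier: `None` values left in place of a missing pad token surface `NO_PAD_TOKEN_FOR_BATCH_MSG`, while unpadded sequences of different lengths surface `UNEVEN_SEQUENCES_FOR_BATCH_MSG`. A rough sketch of the expected behavior (GPT-2 is used here only because it ships without a pad token):

    from transformers import GPT2Tokenizer

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    sequences = ["short", "a noticeably longer sequence"]

    try:
        tokenizer.batch_encode_plus(sequences, return_tensors="pt")
    except ValueError as error:
        print(error)  # should be the uneven-sequences message: no padding was requested

    try:
        tokenizer.batch_encode_plus(sequences, pad_to_max_length=True, return_tensors="pt")
    except ValueError as error:
        print(error)  # should be the no-pad-token message: padding was requested but no pad token is set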


@@ -19,6 +19,8 @@ import pickle
 import shutil
 import tempfile
 
+from tests.utils import require_tf, require_torch
+
 
 class TokenizerTesterMixin:
@@ -40,6 +42,15 @@ class TokenizerTesterMixin:
     def get_input_output_texts(self):
         raise NotImplementedError
 
+    @staticmethod
+    def convert_batch_encode_plus_format_to_encode_plus(batch_encode_plus_sequences):
+        # Switch from batch_encode_plus format: {'input_ids': [[...], [...]], ...}
+        # to the concatenated encode_plus format: [{'input_ids': [...], ...}, {'input_ids': [...], ...}]
+        return [
+            {value: batch_encode_plus_sequences[value][i] for value in batch_encode_plus_sequences.keys()}
+            for i in range(len(batch_encode_plus_sequences["input_ids"]))
+        ]
+
     def test_tokenizers_common_properties(self):
         tokenizer = self.get_tokenizer()
         attributes_list = [
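
The helper added above flips the dict-of-lists output of `batch_encode_plus` into a list of per-example dicts so it can be compared element-wise against `encode_plus` results. For example (the id values are made up for illustration):

    batch = {
        "input_ids": [[101, 7592, 102], [101, 2088, 999, 102]],
        "attention_mask": [[1, 1, 1], [1, 1, 1, 1]],
    }
    per_example = TokenizerTesterMixin.convert_batch_encode_plus_format_to_encode_plus(batch)
    # per_example == [
    #     {"input_ids": [101, 7592, 102], "attention_mask": [1, 1, 1]},
    #     {"input_ids": [101, 2088, 999, 102], "attention_mask": [1, 1, 1, 1]},
    # ]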
@@ -535,11 +546,8 @@ class TokenizerTesterMixin:
         # we're loading an S3 configuration from a pre-trained identifier, and we have no way of testing those today.
 
         tokenizer = self.get_tokenizer(random_argument=True)
-        print(tokenizer.init_kwargs)
         assert tokenizer.init_kwargs["random_argument"] is True
         new_tokenizer = self.get_tokenizer(random_argument=False)
-        print(tokenizer.init_kwargs)
-        print(new_tokenizer.init_kwargs)
         assert tokenizer.init_kwargs["random_argument"] is True
         assert new_tokenizer.init_kwargs["random_argument"] is False
@@ -562,3 +570,101 @@ class TokenizerTesterMixin:
         for word, ind in vocab.items():
             self.assertEqual(tokenizer.convert_tokens_to_ids(word), ind)
             self.assertEqual(tokenizer.convert_ids_to_tokens(ind), word)
+
+    def test_batch_encode_plus_batch_sequence_length(self):
+        # Tests that all encoded values have the correct size
+        tokenizer = self.get_tokenizer()
+        sequences = [
+            "Testing batch encode plus",
+            "Testing batch encode plus with different sequence lengths",
+            "Testing batch encode plus with different sequence lengths correctly pads",
+        ]
+
+        encoded_sequences = [tokenizer.encode_plus(sequence, pad_to_max_length=False) for sequence in sequences]
+        encoded_sequences_batch = tokenizer.batch_encode_plus(sequences)
+        self.assertListEqual(
+            encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch)
+        )
+
+        maximum_length = len(max([encoded_sequence["input_ids"] for encoded_sequence in encoded_sequences], key=len))
+
+        encoded_sequences_padded = [
+            tokenizer.encode_plus(sequence, pad_to_max_length=True, max_length=maximum_length)
+            for sequence in sequences
+        ]
+        encoded_sequences_batch_padded = tokenizer.batch_encode_plus(sequences, pad_to_max_length=True)
+        self.assertListEqual(
+            encoded_sequences_padded,
+            self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch_padded),
+        )
+
+    def test_batch_encode_plus_padding(self):
+        # Test that padded sequences are equivalent between batch_encode_plus and encode_plus
+
+        # Right padding tests
+        tokenizer = self.get_tokenizer()
+        sequences = [
+            "Testing batch encode plus",
+            "Testing batch encode plus with different sequence lengths",
+            "Testing batch encode plus with different sequence lengths correctly pads",
+        ]
+
+        max_length = 100
+        encoded_sequences = [
+            tokenizer.encode_plus(sequence, pad_to_max_length=True, max_length=max_length) for sequence in sequences
+        ]
+        encoded_sequences_batch = tokenizer.batch_encode_plus(sequences, pad_to_max_length=True, max_length=max_length)
+        self.assertListEqual(
+            encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch)
+        )
+
+        # Left padding tests
+        tokenizer = self.get_tokenizer()
+        tokenizer.padding_side = "left"
+        sequences = [
+            "Testing batch encode plus",
+            "Testing batch encode plus with different sequence lengths",
+            "Testing batch encode plus with different sequence lengths correctly pads",
+        ]
+
+        max_length = 100
+        encoded_sequences = [
+            tokenizer.encode_plus(sequence, pad_to_max_length=True, max_length=max_length) for sequence in sequences
+        ]
+        encoded_sequences_batch = tokenizer.batch_encode_plus(sequences, pad_to_max_length=True, max_length=max_length)
+        self.assertListEqual(
+            encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch)
+        )
+
+    @require_torch
+    @require_tf
+    def test_batch_encode_plus_tensors(self):
+        tokenizer = self.get_tokenizer()
+        sequences = [
+            "Testing batch encode plus",
+            "Testing batch encode plus with different sequence lengths",
+            "Testing batch encode plus with different sequence lengths correctly pads",
+        ]
+
+        # A Tensor cannot be built from sequences which are not the same size
+        self.assertRaises(ValueError, tokenizer.batch_encode_plus, sequences, return_tensors="pt")
+        self.assertRaises(ValueError, tokenizer.batch_encode_plus, sequences, return_tensors="tf")
+
+        if tokenizer.pad_token_id is None:
+            self.assertRaises(
+                ValueError, tokenizer.batch_encode_plus, sequences, pad_to_max_length=True, return_tensors="pt"
+            )
+            self.assertRaises(
+                ValueError, tokenizer.batch_encode_plus, sequences, pad_to_max_length=True, return_tensors="tf"
+            )
+        else:
+            pytorch_tensor = tokenizer.batch_encode_plus(sequences, pad_to_max_length=True, return_tensors="pt")
+            tensorflow_tensor = tokenizer.batch_encode_plus(sequences, pad_to_max_length=True, return_tensors="tf")
+            encoded_sequences = tokenizer.batch_encode_plus(sequences, pad_to_max_length=True)
+
+            for key in encoded_sequences.keys():
+                pytorch_value = pytorch_tensor[key].tolist()
+                tensorflow_value = tensorflow_tensor[key].numpy().tolist()
+                encoded_value = encoded_sequences[key]
+
+                self.assertEqual(pytorch_value, tensorflow_value, encoded_value)