# coding=utf-8
# Copyright 2019 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import inspect
import itertools
import json
import os
import pickle
import re
import shutil
import sys
import tempfile
import unittest
import unittest.mock as mock
from collections import OrderedDict
from itertools import takewhile
from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union

from huggingface_hub import Repository, delete_repo, login
from requests.exceptions import HTTPError

from transformers import (
    AlbertTokenizer,
    AlbertTokenizerFast,
    AutoTokenizer,
    BertTokenizer,
    BertTokenizerFast,
    PreTrainedTokenizer,
    PreTrainedTokenizerBase,
    PreTrainedTokenizerFast,
    SpecialTokensMixin,
    Trainer,
    TrainingArguments,
    is_tf_available,
    is_tokenizers_available,
    is_torch_available,
)
from transformers.testing_utils import (
    PASS,
    USER,
    get_tests_dir,
    is_pt_tf_cross_test,
    is_staging_test,
    require_tf,
    require_tokenizers,
    require_torch,
    slow,
)
from transformers.tokenization_utils import AddedToken, Trie


if is_torch_available():
    import torch.nn as nn

if TYPE_CHECKING:
    from transformers import PretrainedConfig, PreTrainedModel, TFPreTrainedModel


sys.path.append(str(Path(__file__).parent.parent / "utils"))

from test_module.custom_tokenization import CustomTokenizer  # noqa E402


if is_tokenizers_available():
    from test_module.custom_tokenization_fast import CustomTokenizerFast  # noqa E402


NON_ENGLISH_TAGS = ["chinese", "dutch", "french", "finnish", "german", "multilingual"]

SMALL_TRAINING_CORPUS = [
    ["This is the first sentence.", "This is the second one."],
    ["This sentence (contains #) over symbols and numbers 12 3.", "But not this one."],
]


def filter_non_english(_, pretrained_name: str):
    """Filter all the models for non-English languages."""
    return not any(lang in pretrained_name for lang in NON_ENGLISH_TAGS)


def filter_roberta_detectors(_, pretrained_name: str):
    return "detector" not in pretrained_name
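

# Invert the auto model/tokenizer mappings: map each tokenizer class (slow and
# fast) to the (config, model) pair it is registered with.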
def merge_model_tokenizer_mappings(
    model_mapping: Dict["PretrainedConfig", Union["PreTrainedModel", "TFPreTrainedModel"]],
    tokenizer_mapping: Dict["PretrainedConfig", Tuple["PreTrainedTokenizer", "PreTrainedTokenizerFast"]],
) -> Dict[
    Union["PreTrainedTokenizer", "PreTrainedTokenizerFast"],
    Tuple["PretrainedConfig", Union["PreTrainedModel", "TFPreTrainedModel"]],
]:
    configurations = list(model_mapping.keys())
    model_tokenizer_mapping = OrderedDict([])

    for configuration in configurations:
        if configuration in model_mapping and configuration in tokenizer_mapping:
            model = model_mapping[configuration]
            tokenizer = tokenizer_mapping[configuration][0]
            tokenizer_fast = tokenizer_mapping[configuration][1]

            model_tokenizer_mapping.update({tokenizer: (configuration, model)})
            if tokenizer_fast is not None:
                model_tokenizer_mapping.update({tokenizer_fast: (configuration, model)})

    return model_tokenizer_mapping


class TokenizerTesterMixin:
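
    # Subclasses set these attributes to select which tokenizer classes, pretrained
    # checkpoints and optional features the common tests below exercise.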
    tokenizer_class = None
    rust_tokenizer_class = None
    test_slow_tokenizer = True
    test_rust_tokenizer = True
    space_between_special_tokens = False
    from_pretrained_kwargs = None
    from_pretrained_filter = None
    from_pretrained_vocab_key = "vocab_file"
    test_seq2seq = True

    # set to True to test a sentencepiece tokenizer
    test_sentencepiece = False

    # set to True to ignore casing when testing a sentencepiece tokenizer
    # test_sentencepiece must also be set to True
    test_sentencepiece_ignore_case = False

    def setUp(self) -> None:
        # Tokenizer.filter makes it possible to filter which Tokenizer to test based on all the
        # information available in Tokenizer (name, rust class, python class, vocab key name)
        if self.test_rust_tokenizer:
            tokenizers_list = [
                (
                    self.rust_tokenizer_class,
                    pretrained_name,
                    self.from_pretrained_kwargs if self.from_pretrained_kwargs is not None else {},
                )
                for pretrained_name in self.rust_tokenizer_class.pretrained_vocab_files_map[
                    self.from_pretrained_vocab_key
                ].keys()
                if self.from_pretrained_filter is None
                or (self.from_pretrained_filter is not None and self.from_pretrained_filter(pretrained_name))
            ]
            self.tokenizers_list = tokenizers_list[:1]  # Let's just test the first pretrained vocab for speed
        else:
            self.tokenizers_list = []
        with open(f"{get_tests_dir()}/fixtures/sample_text.txt", encoding="utf-8") as f_data:
            self._data = f_data.read().replace("\n\n", "\n").strip()

        self.tmpdirname = tempfile.mkdtemp()

    def tearDown(self):
        shutil.rmtree(self.tmpdirname)

    def get_input_output_texts(self, tokenizer):
        input_txt = self.get_clean_sequence(tokenizer)[0]
        return input_txt, input_txt

    def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20, min_length=5) -> Tuple[str, list]:
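        # Keep only ids whose decoded form matches ^[ a-zA-Z]+$ and that round-trip
        # through encode() back to the same single id.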
        toks = [(i, tokenizer.decode([i], clean_up_tokenization_spaces=False)) for i in range(len(tokenizer))]
        toks = list(filter(lambda t: re.match(r"^[ a-zA-Z]+$", t[1]), toks))
        toks = list(filter(lambda t: [t[0]] == tokenizer.encode(t[1], add_special_tokens=False), toks))
        if max_length is not None and len(toks) > max_length:
            toks = toks[:max_length]
        if min_length is not None and len(toks) < min_length and len(toks) > 0:
            while len(toks) < min_length:
                toks = toks + toks
        # toks_str = [t[1] for t in toks]
        toks_ids = [t[0] for t in toks]

        # Ensure consistency
        output_txt = tokenizer.decode(toks_ids, clean_up_tokenization_spaces=False)
        if " " not in output_txt and len(toks_ids) > 1:
            output_txt = (
                tokenizer.decode([toks_ids[0]], clean_up_tokenization_spaces=False)
                + " "
                + tokenizer.decode(toks_ids[1:], clean_up_tokenization_spaces=False)
            )
        if with_prefix_space:
            output_txt = " " + output_txt
        output_ids = tokenizer.encode(output_txt, add_special_tokens=False)
        return output_txt, output_ids
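
    # Return every tokenizer flavour (slow and/or fast) that is enabled for this test class.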
    def get_tokenizers(self, fast=True, **kwargs) -> List[PreTrainedTokenizerBase]:
        if fast and self.test_rust_tokenizer and self.test_slow_tokenizer:
            return [self.get_tokenizer(**kwargs), self.get_rust_tokenizer(**kwargs)]
        elif fast and self.test_rust_tokenizer:
            return [self.get_rust_tokenizer(**kwargs)]
        elif self.test_slow_tokenizer:
            return [self.get_tokenizer(**kwargs)]
        else:
            raise ValueError("This tokenizer class has no tokenizer to be tested.")

    def get_tokenizer(self, **kwargs) -> PreTrainedTokenizer:
        return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)

    def get_rust_tokenizer(self, **kwargs) -> PreTrainedTokenizerFast:
        return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)

    def tokenizer_integration_test_util(
        self,
        expected_encoding: Dict,
        model_name: str,
        revision: str = None,
        sequences: List[str] = None,
        decode_kwargs: Dict[str, Any] = None,
        padding: bool = True,
    ):
        """
        Util for integration test.

        Text is tokenized and then reverted back to text. Both results are then checked.

        Args:
            expected_encoding:
                The expected result of the tokenizer output.
            model_name:
                The model name of the tokenizer to load and use.
            revision:
                The full git revision number of the model. This is to pin the
                tokenizer config and to avoid that tests start to fail if the
                config gets changed upstream.
            sequences:
                Can overwrite the texts that are used to check the tokenizer.
                This is useful if the tokenizer supports non-English languages
                like French.
            decode_kwargs:
                Additional args for the ``decode`` function which reverts the
                tokenized text back to a string.
            padding:
                Activates and controls padding of the tokenizer.
        """
        decode_kwargs = {} if decode_kwargs is None else decode_kwargs
        if sequences is None:
            sequences = [
                "Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides "
                "general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural "
                "Language Understanding (NLU) and Natural Language Generation (NLG) with over 32+ pretrained "
                "models in 100+ languages and deep interoperability between Jax, PyTorch and TensorFlow.",
                "BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly "
                "conditioning on both left and right context in all layers.",
                "The quick brown fox jumps over the lazy dog.",
            ]

        if self.test_sentencepiece_ignore_case:
            sequences = [sequence.lower() for sequence in sequences]

        tokenizer_classes = [self.tokenizer_class]
        if self.test_rust_tokenizer:
            tokenizer_classes.append(self.rust_tokenizer_class)

        for tokenizer_class in tokenizer_classes:
            tokenizer = tokenizer_class.from_pretrained(
                model_name,
                revision=revision,  # to pin the tokenizer version
            )

            encoding = tokenizer(sequences, padding=padding)
            decoded_sequences = [
                tokenizer.decode(seq, skip_special_tokens=True, **decode_kwargs) for seq in encoding["input_ids"]
            ]

            encoding_data = encoding.data
            self.assertDictEqual(encoding_data, expected_encoding)

            for expected, decoded in zip(sequences, decoded_sequences):
                if self.test_sentencepiece_ignore_case:
                    expected = expected.lower()
                self.assertEqual(expected, decoded)
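
    # Padding helpers: check that the Rust ("r") and Python ("p") tokenizers pad to
    # the same length with the same pad token ids.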
    def assert_padded_input_match(self, input_r: list, input_p: list, max_length: int, pad_token_id: int):
        # Ensure we match max_length
        self.assertEqual(len(input_r), max_length)
        self.assertEqual(len(input_p), max_length)

        # Ensure the number of padded tokens is the same
        padded_tokens_r = list(takewhile(lambda i: i == pad_token_id, reversed(input_r)))
        padded_tokens_p = list(takewhile(lambda i: i == pad_token_id, reversed(input_p)))
        self.assertSequenceEqual(padded_tokens_r, padded_tokens_p)

    def assert_batch_padded_input_match(
        self,
        input_r: dict,
        input_p: dict,
        max_length: int,
        pad_token_id: int,
        model_main_input_name: str = "input_ids",
    ):
        for i_r in input_r.values():
            self.assertEqual(len(i_r), 2)
            self.assertEqual(len(i_r[0]), max_length)
            self.assertEqual(len(i_r[1]), max_length)

        for i_r, i_p in zip(input_r[model_main_input_name], input_p[model_main_input_name]):
            self.assert_padded_input_match(i_r, i_p, max_length, pad_token_id)

        for i_r, i_p in zip(input_r["attention_mask"], input_p["attention_mask"]):
            self.assertSequenceEqual(i_r, i_p)

    @staticmethod
    def convert_batch_encode_plus_format_to_encode_plus(batch_encode_plus_sequences):
        # Switch from batch_encode_plus format: {'input_ids': [[...], [...]], ...}
        # to the list of examples/ encode_plus format: [{'input_ids': [...], ...}, {'input_ids': [...], ...}]
        return [
            {value: batch_encode_plus_sequences[value][i] for value in batch_encode_plus_sequences.keys()}
            for i in range(len(batch_encode_plus_sequences["input_ids"]))
        ]

    # TODO: this test can be combined with `test_sentencepiece_tokenize_and_convert_tokens_to_string` after the latter is extended to all tokenizers.
    def test_tokenize_special_tokens(self):
        """Test `tokenize` with special tokens."""
        tokenizers = self.get_tokenizers(fast=True, do_lower_case=True)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                SPECIAL_TOKEN_1 = "[SPECIAL_TOKEN_1]"
                SPECIAL_TOKEN_2 = "[SPECIAL_TOKEN_2]"

                # TODO:
                # Can we combine `unique_no_split_tokens` and `all_special_tokens` (and the properties
                # related to them) into one variable (property) for better maintainability?
                # `add_tokens` method stores special tokens only in `tokenizer.unique_no_split_tokens`. (in tokenization_utils.py)
                tokenizer.add_tokens([SPECIAL_TOKEN_1], special_tokens=True)
                # `add_special_tokens` method stores special tokens in `tokenizer.additional_special_tokens`,
                # which also occur in `tokenizer.all_special_tokens`. (in tokenization_utils_base.py)
                tokenizer.add_special_tokens({"additional_special_tokens": [SPECIAL_TOKEN_2]})

                token_1 = tokenizer.tokenize(SPECIAL_TOKEN_1)
                token_2 = tokenizer.tokenize(SPECIAL_TOKEN_2)

                self.assertEqual(len(token_1), 1)
                self.assertEqual(len(token_2), 1)
                self.assertEqual(token_1[0], SPECIAL_TOKEN_1)
                self.assertEqual(token_2[0], SPECIAL_TOKEN_2)

    # TODO: this test could be extended to all tokenizers - not just the sentencepiece
    def test_sentencepiece_tokenize_and_convert_tokens_to_string(self):
        """Test ``_tokenize`` and ``convert_tokens_to_string``."""
        if not self.test_sentencepiece:
            return

        tokenizer = self.get_tokenizer()
        text = "This is text to test the tokenizer."

        if self.test_sentencepiece_ignore_case:
            text = text.lower()

        tokens = tokenizer.tokenize(text)

        self.assertTrue(len(tokens) > 0)

        # check if converting back to original text works
        reverse_text = tokenizer.convert_tokens_to_string(tokens)

        if self.test_sentencepiece_ignore_case:
            reverse_text = reverse_text.lower()

        self.assertEqual(reverse_text, text)
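
    # Subword regularization (sentencepiece's `enable_sampling`, `alpha`, `nbest_size`)
    # makes tokenization stochastic. The two tests below only check that `sp_model_kwargs`
    # survive construction and pickling; the sampling behaviour itself is verified in
    # `check_subword_sampling`.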
    def test_subword_regularization_tokenizer(self) -> None:
        if not self.test_sentencepiece:
            return

        # Subword regularization is only available for the slow tokenizer.
        sp_model_kwargs = {"enable_sampling": True, "alpha": 0.1, "nbest_size": -1}
        tokenizer = self.get_tokenizer(sp_model_kwargs=sp_model_kwargs)

        self.assertTrue(hasattr(tokenizer, "sp_model_kwargs"))
        self.assertIsNotNone(tokenizer.sp_model_kwargs)
        self.assertTrue(isinstance(tokenizer.sp_model_kwargs, dict))
        self.assertEqual(tokenizer.sp_model_kwargs, sp_model_kwargs)
        self.check_subword_sampling(tokenizer)

    def test_pickle_subword_regularization_tokenizer(self) -> None:
        """Google pickle __getstate__ __setstate__ if you are struggling with this."""
        if not self.test_sentencepiece:
            return

        # Subword regularization is only available for the slow tokenizer.
        sp_model_kwargs = {"enable_sampling": True, "alpha": 0.1, "nbest_size": -1}
        tokenizer = self.get_tokenizer(sp_model_kwargs=sp_model_kwargs)
        tokenizer_bin = pickle.dumps(tokenizer)
        del tokenizer
        tokenizer_new = pickle.loads(tokenizer_bin)

        self.assertTrue(hasattr(tokenizer_new, "sp_model_kwargs"))
        self.assertIsNotNone(tokenizer_new.sp_model_kwargs)
        self.assertTrue(isinstance(tokenizer_new.sp_model_kwargs, dict))
        self.assertEqual(tokenizer_new.sp_model_kwargs, sp_model_kwargs)
        self.check_subword_sampling(tokenizer_new)

    def test_save_sentencepiece_tokenizer(self) -> None:
        if not self.test_sentencepiece or not self.test_slow_tokenizer:
            return
        # We want to verify that we will be able to save the tokenizer even if the original files that were used to
        # build the tokenizer have been deleted in the meantime.
        text = "This is text to test the tokenizer."

        tokenizer_slow_1 = self.get_tokenizer()
        encoding_tokenizer_slow_1 = tokenizer_slow_1(text)

        tmpdirname_1 = tempfile.mkdtemp()
        tmpdirname_2 = tempfile.mkdtemp()

        tokenizer_slow_1.save_pretrained(tmpdirname_1)
        tokenizer_slow_2 = self.tokenizer_class.from_pretrained(tmpdirname_1)
        encoding_tokenizer_slow_2 = tokenizer_slow_2(text)

        shutil.rmtree(tmpdirname_1)
        tokenizer_slow_2.save_pretrained(tmpdirname_2)

        tokenizer_slow_3 = self.tokenizer_class.from_pretrained(tmpdirname_2)
        encoding_tokenizer_slow_3 = tokenizer_slow_3(text)
        shutil.rmtree(tmpdirname_2)

        self.assertEqual(encoding_tokenizer_slow_1, encoding_tokenizer_slow_2)
        self.assertEqual(encoding_tokenizer_slow_1, encoding_tokenizer_slow_3)

    def test_model_input_names_signature(self):
        accepted_model_main_input_names = [
            "input_ids",  # nlp models
            "input_values",  # speech models
        ]

        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            # first name of model_input_names has to correspond to main model input name
            # to make sure `tokenizer.pad(...)` works correctly
            self.assertTrue(tokenizer.model_input_names[0] in accepted_model_main_input_names)

    def test_rust_tokenizer_signature(self):
        if not self.test_rust_tokenizer:
            return

        signature = inspect.signature(self.rust_tokenizer_class.__init__)

        self.assertIn("tokenizer_file", signature.parameters)
        self.assertIsNone(signature.parameters["tokenizer_file"].default)

    def test_tokenizer_slow_store_full_signature(self):
        if not self.test_slow_tokenizer:
            return

        signature = inspect.signature(self.tokenizer_class.__init__)
        tokenizer = self.get_tokenizer()

        for parameter_name, parameter in signature.parameters.items():
            if parameter.default != inspect.Parameter.empty:
                self.assertIn(parameter_name, tokenizer.init_kwargs)

    def test_tokenizer_fast_store_full_signature(self):
        if not self.test_rust_tokenizer:
            return

        signature = inspect.signature(self.rust_tokenizer_class.__init__)
        tokenizer = self.get_rust_tokenizer()

        for parameter_name, parameter in signature.parameters.items():
            if parameter.default != inspect.Parameter.empty and parameter_name not in [
                "vocab_file",
                "merges_file",
                "tokenizer_file",
            ]:
                self.assertIn(parameter_name, tokenizer.init_kwargs)

    def test_rust_and_python_full_tokenizers(self):
        if not self.test_rust_tokenizer:
            return

        if not self.test_slow_tokenizer:
            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
            return

        tokenizer = self.get_tokenizer()
        rust_tokenizer = self.get_rust_tokenizer()

        sequence, _ = self.get_input_output_texts(tokenizer)

        # We don't have an exact equivalence on `tokenize()` between Rust and Slow
        # Slow tokenizers only split tokens, Rust tokenizers will replace them with <unk>
        # tokens = tokenizer.tokenize(sequence)
        # rust_tokens = rust_tokenizer.tokenize(sequence)
        # self.assertListEqual(tokens, rust_tokens)

        ids = tokenizer.encode(sequence, add_special_tokens=False)
        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
        self.assertListEqual(ids, rust_ids)

        ids = tokenizer.encode(sequence, add_special_tokens=True)
        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=True)
        self.assertListEqual(ids, rust_ids)

    def test_tokenizers_common_properties(self):
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                attributes_list = [
                    "bos_token",
                    "eos_token",
                    "unk_token",
                    "sep_token",
                    "pad_token",
                    "cls_token",
                    "mask_token",
                ]
                for attr in attributes_list:
                    self.assertTrue(hasattr(tokenizer, attr))
                    self.assertTrue(hasattr(tokenizer, attr + "_id"))

                self.assertTrue(hasattr(tokenizer, "additional_special_tokens"))
                self.assertTrue(hasattr(tokenizer, "additional_special_tokens_ids"))

                attributes_list = [
                    "model_max_length",
                    "init_inputs",
                    "init_kwargs",
                ]
                if not isinstance(tokenizer, PreTrainedTokenizerFast):
                    attributes_list += [
                        "added_tokens_encoder",
                        "added_tokens_decoder",
                    ]
                for attr in attributes_list:
                    self.assertTrue(hasattr(tokenizer, attr))
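
    # Round-trip check: encode, save_pretrained, reload with from_pretrained, and make
    # sure tokens, vocab, added tokens and model_max_length survive the round trip.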
    def test_save_and_load_tokenizer(self):
        # safety check on max_len default value so we are sure the test works
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                self.assertNotEqual(tokenizer.model_max_length, 42)

        # Now let's start the test
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                # Isolate this from the other tests because we save additional tokens/etc
                tmpdirname = tempfile.mkdtemp()

                sample_text = " He is very happy, UNwant\u00E9d,running"
                before_tokens = tokenizer.encode(sample_text, add_special_tokens=False)
                before_vocab = tokenizer.get_vocab()
                tokenizer.save_pretrained(tmpdirname)

                after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname)
                after_tokens = after_tokenizer.encode(sample_text, add_special_tokens=False)
                after_vocab = after_tokenizer.get_vocab()
                self.assertListEqual(before_tokens, after_tokens)
                self.assertDictEqual(before_vocab, after_vocab)

                shutil.rmtree(tmpdirname)

        tokenizers = self.get_tokenizers(model_max_length=42)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                # Isolate this from the other tests because we save additional tokens/etc
                tmpdirname = tempfile.mkdtemp()

                sample_text = " He is very happy, UNwant\u00E9d,running"
                tokenizer.add_tokens(["bim", "bambam"])
                additional_special_tokens = tokenizer.additional_special_tokens
                additional_special_tokens.append("new_additional_special_token")
                tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
                before_tokens = tokenizer.encode(sample_text, add_special_tokens=False)
                before_vocab = tokenizer.get_vocab()
                tokenizer.save_pretrained(tmpdirname)

                after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname)
                after_tokens = after_tokenizer.encode(sample_text, add_special_tokens=False)
                after_vocab = after_tokenizer.get_vocab()
                self.assertListEqual(before_tokens, after_tokens)
                self.assertDictEqual(before_vocab, after_vocab)
                self.assertIn("bim", after_vocab)
                self.assertIn("bambam", after_vocab)
                self.assertIn("new_additional_special_token", after_tokenizer.additional_special_tokens)
                self.assertEqual(after_tokenizer.model_max_length, 42)

                tokenizer = tokenizer.__class__.from_pretrained(tmpdirname, model_max_length=43)
                self.assertEqual(tokenizer.model_max_length, 43)

                shutil.rmtree(tmpdirname)

        # Test that we can also use the non-legacy saving format for fast tokenizers
        tokenizers = self.get_tokenizers(model_max_length=42)
        for tokenizer in tokenizers:
            if not tokenizer.is_fast:
                continue
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                # Isolate this from the other tests because we save additional tokens/etc
                tmpdirname = tempfile.mkdtemp()

                sample_text = " He is very happy, UNwant\u00E9d,running"
                tokenizer.add_tokens(["bim", "bambam"])
                additional_special_tokens = tokenizer.additional_special_tokens
                additional_special_tokens.append("new_additional_special_token")
                tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
                before_tokens = tokenizer.encode(sample_text, add_special_tokens=False)
                before_vocab = tokenizer.get_vocab()
                tokenizer.save_pretrained(tmpdirname)

                after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname)
                after_tokens = after_tokenizer.encode(sample_text, add_special_tokens=False)
                after_vocab = after_tokenizer.get_vocab()
                self.assertListEqual(before_tokens, after_tokens)
                self.assertDictEqual(before_vocab, after_vocab)
                self.assertIn("bim", after_vocab)
                self.assertIn("bambam", after_vocab)
                self.assertIn("new_additional_special_token", after_tokenizer.additional_special_tokens)
                self.assertEqual(after_tokenizer.model_max_length, 42)

                tokenizer = tokenizer.__class__.from_pretrained(tmpdirname, model_max_length=43)
                self.assertEqual(tokenizer.model_max_length, 43)

                shutil.rmtree(tmpdirname)

    def test_pickle_tokenizer(self):
        """Google pickle __getstate__ __setstate__ if you are struggling with this."""
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                self.assertIsNotNone(tokenizer)

                text = "Munich and Berlin are nice cities"
                subwords = tokenizer.tokenize(text)

                filename = os.path.join(self.tmpdirname, "tokenizer.bin")
                with open(filename, "wb") as handle:
                    pickle.dump(tokenizer, handle)

                with open(filename, "rb") as handle:
                    tokenizer_new = pickle.load(handle)

                subwords_loaded = tokenizer_new.tokenize(text)

                self.assertListEqual(subwords, subwords_loaded)

    @require_tokenizers
    def test_pickle_added_tokens(self):
        tok1 = AddedToken("<s>", rstrip=True, lstrip=True, normalized=False, single_word=True)
        tok2 = pickle.loads(pickle.dumps(tok1))

        self.assertEqual(tok1.__getstate__(), tok2.__getstate__())

    def test_added_tokens_do_lower_case(self):
        tokenizers = self.get_tokenizers(do_lower_case=True)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                if not hasattr(tokenizer, "do_lower_case") or not tokenizer.do_lower_case:
                    continue

                special_token = tokenizer.all_special_tokens[0]

                text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token
                text2 = special_token + " AAAAA BBBBBB low CCCCCCCCCDDDDDDDD l " + special_token

                toks_before_adding = tokenizer.tokenize(text)  # toks before adding new_toks

                new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", "AAAAA BBBBBB", "CCCCCCCCCDDDDDDDD"]
                added = tokenizer.add_tokens([AddedToken(tok, lstrip=True, rstrip=True) for tok in new_toks])

                toks_after_adding = tokenizer.tokenize(text)
                toks_after_adding2 = tokenizer.tokenize(text2)

                # Rust tokenizers don't lowercase added tokens at the time of calling `tokenizer.add_tokens`,
                # while python tokenizers do, so new_toks 0 and 2 would be treated as the same, as would new_toks 1 and 3.
                self.assertIn(added, [2, 4])

                self.assertListEqual(toks_after_adding, toks_after_adding2)
                self.assertTrue(
                    len(toks_before_adding) > len(toks_after_adding),  # toks_before_adding should be longer
                )

                # Check that none of the special tokens are lowercased
                sequence_with_special_tokens = "A " + " yEs ".join(tokenizer.all_special_tokens) + " B"
                # Convert the tokenized list to str as some special tokens are tokenized like normal tokens
                # which have a prefix space, e.g. the mask token of Albert, and cannot match the original
                # special tokens exactly.
                tokenized_sequence = "".join(tokenizer.tokenize(sequence_with_special_tokens))

                for special_token in tokenizer.all_special_tokens:
                    self.assertTrue(special_token in tokenized_sequence)

        tokenizers = self.get_tokenizers(do_lower_case=True)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                if hasattr(tokenizer, "do_lower_case") and tokenizer.do_lower_case:
                    continue

                special_token = tokenizer.all_special_tokens[0]

                text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token
                text2 = special_token + " AAAAA BBBBBB low CCCCCCCCCDDDDDDDD l " + special_token

                toks_before_adding = tokenizer.tokenize(text)  # toks before adding new_toks

                new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", "AAAAA BBBBBB", "CCCCCCCCCDDDDDDDD"]
                added = tokenizer.add_tokens([AddedToken(tok, lstrip=True, rstrip=True) for tok in new_toks])
                self.assertIn(added, [2, 4])

                toks_after_adding = tokenizer.tokenize(text)
                toks_after_adding2 = tokenizer.tokenize(text2)

                self.assertEqual(len(toks_after_adding), len(toks_after_adding2))  # Length should still be the same
                self.assertNotEqual(
                    toks_after_adding[1], toks_after_adding2[1]
                )  # But at least the first non-special tokens should differ
                self.assertTrue(
                    len(toks_before_adding) > len(toks_after_adding),  # toks_before_adding should be longer
                )
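
    # `add_tokens`/`add_special_tokens` grow len(tokenizer) but never `vocab_size`;
    # added tokens receive ids at or above `vocab_size`, which is what the
    # `assertGreater(..., vocab_size - 1)` checks below rely on.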
    def test_add_tokens_tokenizer(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                vocab_size = tokenizer.vocab_size
                all_size = len(tokenizer)

                self.assertNotEqual(vocab_size, 0)

                # We usually have added tokens from the start in tests because our vocab fixtures are
                # smaller than the original vocabs - let's not assert this
                # self.assertEqual(vocab_size, all_size)

                new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"]
                added_toks = tokenizer.add_tokens(new_toks)
                vocab_size_2 = tokenizer.vocab_size
                all_size_2 = len(tokenizer)

                self.assertNotEqual(vocab_size_2, 0)
                self.assertEqual(vocab_size, vocab_size_2)
                self.assertEqual(added_toks, len(new_toks))
                self.assertEqual(all_size_2, all_size + len(new_toks))

                tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False)

                self.assertGreaterEqual(len(tokens), 4)
                self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
                self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)

                new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"}
                added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
                vocab_size_3 = tokenizer.vocab_size
                all_size_3 = len(tokenizer)

                self.assertNotEqual(vocab_size_3, 0)
                self.assertEqual(vocab_size, vocab_size_3)
                self.assertEqual(added_toks_2, len(new_toks_2))
                self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))

                tokens = tokenizer.encode(
                    ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", add_special_tokens=False
                )

                self.assertGreaterEqual(len(tokens), 6)
                self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
                self.assertGreater(tokens[0], tokens[1])
                self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
                self.assertGreater(tokens[-2], tokens[-3])
                self.assertEqual(tokens[0], tokenizer.eos_token_id)
                self.assertEqual(tokens[-2], tokenizer.pad_token_id)

    def test_add_special_tokens(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                input_text, ids = self.get_clean_sequence(tokenizer)

                special_token = "[SPECIAL_TOKEN]"

                tokenizer.add_special_tokens({"cls_token": special_token})
                encoded_special_token = tokenizer.encode(special_token, add_special_tokens=False)
                self.assertEqual(len(encoded_special_token), 1)

                text = tokenizer.decode(ids + encoded_special_token, clean_up_tokenization_spaces=False)
                encoded = tokenizer.encode(text, add_special_tokens=False)

                input_encoded = tokenizer.encode(input_text, add_special_tokens=False)
                special_token_id = tokenizer.encode(special_token, add_special_tokens=False)
                self.assertEqual(encoded, input_encoded + special_token_id)

                decoded = tokenizer.decode(encoded, skip_special_tokens=True)
                self.assertTrue(special_token not in decoded)

    def test_internal_consistency(self):
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                input_text, output_text = self.get_input_output_texts(tokenizer)

                tokens = tokenizer.tokenize(input_text)
                ids = tokenizer.convert_tokens_to_ids(tokens)
                ids_2 = tokenizer.encode(input_text, add_special_tokens=False)
                self.assertListEqual(ids, ids_2)

                tokens_2 = tokenizer.convert_ids_to_tokens(ids)
                self.assertNotEqual(len(tokens_2), 0)
                text_2 = tokenizer.decode(ids)
                self.assertIsInstance(text_2, str)

                self.assertEqual(text_2, output_text)
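
    # `spaces_between_special_tokens` controls whether `decode` joins added/special
    # tokens with spaces; the expected output therefore depends on the class-level
    # `space_between_special_tokens` flag.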
    @require_tokenizers
    def test_encode_decode_with_spaces(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):

                new_toks = [
                    AddedToken("[ABC]", normalized=False),
                    AddedToken("[DEF]", normalized=False),
                    AddedToken("GHI IHG", normalized=False),
                ]
                tokenizer.add_tokens(new_toks)
                input = "[ABC][DEF][ABC]GHI IHG[DEF]"
                if self.space_between_special_tokens:
                    output = "[ABC] [DEF] [ABC] GHI IHG [DEF]"
                else:
                    output = input
                encoded = tokenizer.encode(input, add_special_tokens=False)
                decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens)

                self.assertIn(decoded, [output, output.lower()])

    def test_pretrained_model_lists(self):
        # We should have at least one default checkpoint for each tokenizer
        # We should specify the max input length as well (used in some part to list the pretrained checkpoints)
        self.assertGreaterEqual(len(self.tokenizer_class.pretrained_vocab_files_map), 1)
        self.assertGreaterEqual(len(list(self.tokenizer_class.pretrained_vocab_files_map.values())[0]), 1)
        self.assertEqual(
            len(list(self.tokenizer_class.pretrained_vocab_files_map.values())[0]),
            len(self.tokenizer_class.max_model_input_sizes),
        )

        weights_list = list(self.tokenizer_class.max_model_input_sizes.keys())
        weights_lists_2 = []
        for file_id, map_list in self.tokenizer_class.pretrained_vocab_files_map.items():
            weights_lists_2.append(list(map_list.keys()))

        for weights_list_2 in weights_lists_2:
            self.assertListEqual(weights_list, weights_list_2)

    def test_mask_output(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                if (
                    tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer"
                    and "token_type_ids" in tokenizer.model_input_names
                ):
                    seq_0 = "Test this method."
                    seq_1 = "With these inputs."
                    information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True)
                    sequences, mask = information["input_ids"], information["token_type_ids"]
                    self.assertEqual(len(sequences), len(mask))

    def test_token_type_ids(self):
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                seq_0 = "Test this method."

                # We want sequence 0 and sequence 1 to be tagged with token_type_ids
                # 0 and 1 respectively
                # (regardless of whether the model uses token type ids)
                # We use this assumption in the QA pipeline among other places
                output = tokenizer(seq_0, return_token_type_ids=True)
                self.assertIn(0, output["token_type_ids"])

    def test_sequence_ids(self):
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            if not tokenizer.is_fast:
                continue
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                seq_0 = "Test this method."
                seq_1 = "With these inputs."

                # We want sequence 0 and sequence 1 to be tagged with token_type_ids
                # 0 and 1 respectively
                # (regardless of whether the model uses token type ids)
                # We use this assumption in the QA pipeline among other places
                output = tokenizer(seq_0)
                self.assertIn(0, output.sequence_ids())

                output = tokenizer(seq_0, seq_1)
                self.assertIn(0, output.sequence_ids())
                self.assertIn(1, output.sequence_ids())
                if tokenizer.num_special_tokens_to_add(pair=True):
                    self.assertIn(None, output.sequence_ids())
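
    # `num_special_tokens_to_add(pair=True)` must equal the number of tokens that
    # `encode(..., add_special_tokens=True)` actually inserts for a pair of sequences.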
    def test_number_of_added_tokens(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):

                seq_0 = "Test this method."
                seq_1 = "With these inputs."

                sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=False)
                attached_sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)

                # Method is implemented (e.g. not GPT-2)
                if len(attached_sequences) != 2:
                    self.assertEqual(
                        tokenizer.num_special_tokens_to_add(pair=True), len(attached_sequences) - len(sequences)
                    )
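
    # Truncation behaviour: every padding/truncation combination must cap the encoding
    # at model_max_length, while `truncation=False` only logs a warning and returns the
    # untruncated sequence.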
    def test_maximum_encoding_length_single_input(self):
        tokenizers = self.get_tokenizers(do_lower_case=False, model_max_length=100)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                seq_0, ids = self.get_clean_sequence(tokenizer, max_length=20)

                sequence = tokenizer.encode(seq_0, add_special_tokens=False)
                total_length = len(sequence)

                self.assertGreater(total_length, 4, "Issue with the testing sequence, please update it, it's too short")

                # Test with max model input length
                model_max_length = tokenizer.model_max_length
                self.assertEqual(model_max_length, 100)
                seq_1 = seq_0 * model_max_length

                sequence1 = tokenizer(seq_1, add_special_tokens=False)
                total_length1 = len(sequence1["input_ids"])
                self.assertGreater(
                    total_length1, model_max_length, "Issue with the testing sequence, please update it, it's too short"
                )

                # Simple
                padding_strategies = (
                    [False, True, "longest"] if tokenizer.pad_token and tokenizer.pad_token_id >= 0 else [False]
                )
                for padding_state in padding_strategies:
                    with self.subTest(f"Padding: {padding_state}"):
                        for truncation_state in [True, "longest_first", "only_first"]:
                            with self.subTest(f"Truncation: {truncation_state}"):
                                output = tokenizer(seq_1, padding=padding_state, truncation=truncation_state)
                                self.assertEqual(len(output["input_ids"]), model_max_length)

                                output = tokenizer([seq_1], padding=padding_state, truncation=truncation_state)
                                self.assertEqual(len(output["input_ids"][0]), model_max_length)

                        # Simple with no truncation
                        # Reset warnings
                        tokenizer.deprecation_warnings = {}
                        with self.assertLogs("transformers", level="WARNING") as cm:
                            output = tokenizer(seq_1, padding=padding_state, truncation=False)
                            self.assertNotEqual(len(output["input_ids"]), model_max_length)
                        self.assertEqual(len(cm.records), 1)
                        self.assertTrue(
                            cm.records[0].message.startswith(
                                "Token indices sequence length is longer than the specified maximum sequence length for this model"
                            )
                        )

                        tokenizer.deprecation_warnings = {}
                        with self.assertLogs("transformers", level="WARNING") as cm:
                            output = tokenizer([seq_1], padding=padding_state, truncation=False)
                            self.assertNotEqual(len(output["input_ids"][0]), model_max_length)
                        self.assertEqual(len(cm.records), 1)
                        self.assertTrue(
                            cm.records[0].message.startswith(
                                "Token indices sequence length is longer than the specified maximum sequence length for this model"
                            )
                        )

                # Overflowing tokens
                stride = 2
                information = tokenizer(
                    seq_0,
                    max_length=total_length - 2,
                    add_special_tokens=False,
                    stride=stride,
                    truncation="longest_first",
                    return_overflowing_tokens=True,
                    # add_prefix_space=False,
                )

                # Overflowing tokens are handled quite differently in slow and fast tokenizers
                if isinstance(tokenizer, PreTrainedTokenizerFast):
                    truncated_sequence = information["input_ids"][0]
                    overflowing_tokens = information["input_ids"][1]
                    self.assertEqual(len(information["input_ids"]), 2)

                    self.assertEqual(len(truncated_sequence), total_length - 2)
                    self.assertEqual(truncated_sequence, sequence[:-2])

                    self.assertEqual(len(overflowing_tokens), 2 + stride)
                    self.assertEqual(overflowing_tokens, sequence[-(2 + stride):])
                else:
                    truncated_sequence = information["input_ids"]
                    overflowing_tokens = information["overflowing_tokens"]

                    self.assertEqual(len(truncated_sequence), total_length - 2)
                    self.assertEqual(truncated_sequence, sequence[:-2])

                    self.assertEqual(len(overflowing_tokens), 2 + stride)
                    self.assertEqual(overflowing_tokens, sequence[-(2 + stride):])
def test_maximum_encoding_length_pair_input ( self ) :
2020-06-25 23:24:28 +08:00
tokenizers = self . get_tokenizers ( do_lower_case = False , model_max_length = 100 )
2020-06-16 05:12:51 +08:00
for tokenizer in tokenizers :
with self . subTest ( f " { tokenizer . __class__ . __name__ } " ) :
                # Build a sequence from our model's vocabulary
                stride = 2
                seq_0, ids = self.get_clean_sequence(tokenizer, max_length=20)
                if len(ids) <= 2 + stride:
                    seq_0 = (seq_0 + " ") * (2 + stride)
                    ids = None

                seq0_tokens = tokenizer.encode(seq_0, add_special_tokens=False)
                self.assertGreater(len(seq0_tokens), 2 + stride)

                seq_1 = "This is another sentence to be encoded."
                seq1_tokens = tokenizer.encode(seq_1, add_special_tokens=False)
                if abs(len(seq0_tokens) - len(seq1_tokens)) <= 2:
                    seq1_tokens = seq1_tokens + seq1_tokens
                    seq_1 = tokenizer.decode(seq1_tokens, clean_up_tokenization_spaces=False)
                seq1_tokens = tokenizer.encode(seq_1, add_special_tokens=False)

                self.assertGreater(len(seq1_tokens), 2 + stride)

                smallest = seq1_tokens if len(seq0_tokens) > len(seq1_tokens) else seq0_tokens

                # We are not using the special tokens - a bit too hard to test all the tokenizers with this
                # TODO try this again later
                sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=False)  # , add_prefix_space=False)

                # Test with max model input length
                model_max_length = tokenizer.model_max_length
                self.assertEqual(model_max_length, 100)
                seq_2 = seq_0 * model_max_length
                self.assertGreater(len(seq_2), model_max_length)

                sequence1 = tokenizer(seq_1, add_special_tokens=False)
                total_length1 = len(sequence1["input_ids"])
                sequence2 = tokenizer(seq_2, seq_1, add_special_tokens=False)
                total_length2 = len(sequence2["input_ids"])
                self.assertLess(total_length1, model_max_length - 10, "Issue with the testing sequence, please update it.")
                self.assertGreater(total_length2, model_max_length, "Issue with the testing sequence, please update it.")

                # Simple
                padding_strategies = (
                    [False, True, "longest"] if tokenizer.pad_token and tokenizer.pad_token_id >= 0 else [False]
                )
                for padding_state in padding_strategies:
                    with self.subTest(f"{tokenizer.__class__.__name__} Padding: {padding_state}"):
                        for truncation_state in [True, "longest_first", "only_first"]:
                            with self.subTest(f"{tokenizer.__class__.__name__} Truncation: {truncation_state}"):
                                output = tokenizer(seq_2, seq_1, padding=padding_state, truncation=truncation_state)
                                self.assertEqual(len(output["input_ids"]), model_max_length)

                                output = tokenizer([seq_2], [seq_1], padding=padding_state, truncation=truncation_state)
                                self.assertEqual(len(output["input_ids"][0]), model_max_length)

                        # Simple
                        output = tokenizer(seq_1, seq_2, padding=padding_state, truncation="only_second")
                        self.assertEqual(len(output["input_ids"]), model_max_length)

                        output = tokenizer([seq_1], [seq_2], padding=padding_state, truncation="only_second")
                        self.assertEqual(len(output["input_ids"][0]), model_max_length)

                        # Simple with no truncation
                        # Reset warnings
                        tokenizer.deprecation_warnings = {}
                        with self.assertLogs("transformers", level="WARNING") as cm:
                            output = tokenizer(seq_1, seq_2, padding=padding_state, truncation=False)
                            self.assertNotEqual(len(output["input_ids"]), model_max_length)
                        self.assertEqual(len(cm.records), 1)
                        self.assertTrue(
                            cm.records[0].message.startswith(
                                "Token indices sequence length is longer than the specified maximum sequence length for this model"
                            )
                        )

                        tokenizer.deprecation_warnings = {}
                        with self.assertLogs("transformers", level="WARNING") as cm:
                            output = tokenizer([seq_1], [seq_2], padding=padding_state, truncation=False)
                            self.assertNotEqual(len(output["input_ids"][0]), model_max_length)
                        self.assertEqual(len(cm.records), 1)
                        self.assertTrue(
                            cm.records[0].message.startswith(
                                "Token indices sequence length is longer than the specified maximum sequence length for this model"
                            )
                        )
                truncated_first_sequence = tokenizer.encode(seq_0, add_special_tokens=False)[:-2] + tokenizer.encode(
                    seq_1, add_special_tokens=False
                )
                truncated_second_sequence = (
                    tokenizer.encode(seq_0, add_special_tokens=False)
                    + tokenizer.encode(seq_1, add_special_tokens=False)[:-2]
                )
                truncated_longest_sequence = (
                    truncated_first_sequence if len(seq0_tokens) > len(seq1_tokens) else truncated_second_sequence
                )

                overflow_first_sequence = tokenizer.encode(seq_0, add_special_tokens=False)[
                    -(2 + stride) :
                ] + tokenizer.encode(seq_1, add_special_tokens=False)
                overflow_second_sequence = (
                    tokenizer.encode(seq_0, add_special_tokens=False)
                    + tokenizer.encode(seq_1, add_special_tokens=False)[-(2 + stride) :]
                )
                overflow_longest_sequence = (
                    overflow_first_sequence if len(seq0_tokens) > len(seq1_tokens) else overflow_second_sequence
                )
                # Overflowing tokens are handled quite differently in slow and fast tokenizers
                if isinstance(tokenizer, PreTrainedTokenizerFast):
                    information = tokenizer(
                        seq_0,
                        seq_1,
                        max_length=len(sequence) - 2,
                        add_special_tokens=False,
                        stride=stride,
                        truncation="longest_first",
                        return_overflowing_tokens=True,
                        # add_prefix_space=False,
                    )
                    truncated_sequence = information["input_ids"][0]
                    overflowing_tokens = information["input_ids"][1]
                    self.assertEqual(len(information["input_ids"]), 2)

                    self.assertEqual(len(truncated_sequence), len(sequence) - 2)
                    self.assertEqual(truncated_sequence, truncated_longest_sequence)

                    self.assertEqual(len(overflowing_tokens), 2 + stride + len(smallest))
                    self.assertEqual(overflowing_tokens, overflow_longest_sequence)
                else:
                    # No overflowing tokens when using 'longest_first' in python tokenizers
                    with self.assertRaises(ValueError) as context:
                        information = tokenizer(
                            seq_0,
                            seq_1,
                            max_length=len(sequence) - 2,
                            add_special_tokens=False,
                            stride=stride,
                            truncation="longest_first",
                            return_overflowing_tokens=True,
                            # add_prefix_space=False,
                        )

                    self.assertTrue(
                        context.exception.args[0].startswith(
                            "Not possible to return overflowing tokens for pair of sequences with the "
                            "`longest_first`. Please select another truncation strategy than `longest_first`, "
                            "for instance `only_second` or `only_first`."
                        )
                    )
                # Overflowing tokens are handled quite differently in slow and fast tokenizers
                if isinstance(tokenizer, PreTrainedTokenizerFast):
                    information = tokenizer(
                        seq_0,
                        seq_1,
                        max_length=len(sequence) - 2,
                        add_special_tokens=False,
                        stride=stride,
                        truncation=True,
                        return_overflowing_tokens=True,
                        # add_prefix_space=False,
                    )
                    truncated_sequence = information["input_ids"][0]
                    overflowing_tokens = information["input_ids"][1]
                    self.assertEqual(len(information["input_ids"]), 2)

                    self.assertEqual(len(truncated_sequence), len(sequence) - 2)
                    self.assertEqual(truncated_sequence, truncated_longest_sequence)

                    self.assertEqual(len(overflowing_tokens), 2 + stride + len(smallest))
                    self.assertEqual(overflowing_tokens, overflow_longest_sequence)
                else:
                    # No overflowing tokens when using 'longest_first' in python tokenizers
                    with self.assertRaises(ValueError) as context:
                        information = tokenizer(
                            seq_0,
                            seq_1,
                            max_length=len(sequence) - 2,
                            add_special_tokens=False,
                            stride=stride,
                            truncation=True,
                            return_overflowing_tokens=True,
                            # add_prefix_space=False,
                        )

                    self.assertTrue(
                        context.exception.args[0].startswith(
                            "Not possible to return overflowing tokens for pair of sequences with the "
                            "`longest_first`. Please select another truncation strategy than `longest_first`, "
                            "for instance `only_second` or `only_first`."
                        )
                    )
                information_first_truncated = tokenizer(
                    seq_0,
                    seq_1,
                    max_length=len(sequence) - 2,
                    add_special_tokens=False,
                    stride=stride,
                    truncation="only_first",
                    return_overflowing_tokens=True,
                    # add_prefix_space=False,
                )
                # Overflowing tokens are handled quite differently in slow and fast tokenizers
                if isinstance(tokenizer, PreTrainedTokenizerFast):
                    truncated_sequence = information_first_truncated["input_ids"][0]
                    overflowing_tokens = information_first_truncated["input_ids"][1]
                    self.assertEqual(len(information_first_truncated["input_ids"]), 2)

                    self.assertEqual(len(truncated_sequence), len(sequence) - 2)
                    self.assertEqual(truncated_sequence, truncated_first_sequence)

                    self.assertEqual(len(overflowing_tokens), 2 + stride + len(seq1_tokens))
                    self.assertEqual(overflowing_tokens, overflow_first_sequence)
                else:
                    truncated_sequence = information_first_truncated["input_ids"]
                    overflowing_tokens = information_first_truncated["overflowing_tokens"]

                    self.assertEqual(len(truncated_sequence), len(sequence) - 2)
                    self.assertEqual(truncated_sequence, truncated_first_sequence)

                    self.assertEqual(len(overflowing_tokens), 2 + stride)
                    self.assertEqual(overflowing_tokens, seq0_tokens[-(2 + stride) :])

                information_second_truncated = tokenizer(
                    seq_0,
                    seq_1,
                    max_length=len(sequence) - 2,
                    add_special_tokens=False,
                    stride=stride,
                    truncation="only_second",
                    return_overflowing_tokens=True,
                    # add_prefix_space=False,
                )
                # Overflowing tokens are handled quite differently in slow and fast tokenizers
                if isinstance(tokenizer, PreTrainedTokenizerFast):
                    truncated_sequence = information_second_truncated["input_ids"][0]
                    overflowing_tokens = information_second_truncated["input_ids"][1]
                    self.assertEqual(len(information_second_truncated["input_ids"]), 2)

                    self.assertEqual(len(truncated_sequence), len(sequence) - 2)
                    self.assertEqual(truncated_sequence, truncated_second_sequence)

                    self.assertEqual(len(overflowing_tokens), 2 + stride + len(seq0_tokens))
                    self.assertEqual(overflowing_tokens, overflow_second_sequence)
                else:
                    truncated_sequence = information_second_truncated["input_ids"]
                    overflowing_tokens = information_second_truncated["overflowing_tokens"]

                    self.assertEqual(len(truncated_sequence), len(sequence) - 2)
                    self.assertEqual(truncated_sequence, truncated_second_sequence)

                    self.assertEqual(len(overflowing_tokens), 2 + stride)
                    self.assertEqual(overflowing_tokens, seq1_tokens[-(2 + stride) :])
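    # A worked sketch of the overflow arithmetic asserted above (numbers are illustrative):
    # with max_length=len(sequence) - 2, exactly 2 tokens are truncated. Fast tokenizers return
    # those truncated tokens preceded by `stride` tokens of context, followed by the full
    # smallest sequence:
    #
    #     len(overflowing_tokens) == 2 + stride + len(smallest)  # e.g. 2 + 2 + 6 == 10
    #
    # Slow tokenizers (with `only_first`/`only_second`) return only the truncated tokens plus
    # the stride window: len(overflowing_tokens) == 2 + stride.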
    # def test_encode_input_type(self):
    #     tokenizers = self.get_tokenizers(do_lower_case=False)
    #     for tokenizer in tokenizers:
    #         with self.subTest(f"{tokenizer.__class__.__name__}"):
    #             sequence = "Let's encode this sequence"

    #             tokens = sequence.split()  # tokenizer.tokenize(sequence)
    #             # input_ids = tokenizer.convert_tokens_to_ids(tokens)
    #             formatted_input = tokenizer.encode(sequence, add_special_tokens=True, add_prefix_space=False)

    #             self.assertEqual(
    #                 tokenizer.encode(tokens, is_split_into_words=True, add_special_tokens=True), formatted_input
    #             )
    #             # This is not supported with the Rust tokenizers
    #             # self.assertEqual(tokenizer.encode(input_ids, add_special_tokens=True), formatted_input)

    # def test_swap_special_token(self):
    #     tokenizers = self.get_tokenizers(do_lower_case=False)
    #     for tokenizer in tokenizers:
    #         with self.subTest(f"{tokenizer.__class__.__name__}"):
    #             # Our mask token
    #             mask = "<mask>"
    #             # We take a single word in the middle of the vocabulary
    #             all_tokens = sorted(tokenizer.get_vocab().keys())
    #             word = tokenizer.decode(tokenizer.encode(all_tokens[len(all_tokens) // 2], add_special_tokens=False)[:1])

    #             sequence_0 = "Encode " + word + " sequence"
    #             sequence_masked_0 = "Encode " + mask + " sequence"

    #             sequence_1 = word + " this sequence"
    #             sequence_masked_1 = mask + " this sequence"

    #             # Add tokens so that masked token isn't split
    #             # tokens = [AddedToken(t, lstrip=True, normalized=False) for t in sequence.split()]
    #             # tokenizer.add_tokens(tokens)
    #             tokenizer.add_special_tokens(
    #                 {"mask_token": AddedToken(mask, normalized=False)}
    #             )  # Eat left space on Byte-level BPE tokenizers
    #             mask_ind = tokenizer.convert_tokens_to_ids(mask)

    #             # Test first masked sequence
    #             encoded_0 = tokenizer.encode(sequence_0, add_special_tokens=False)
    #             encoded_masked = tokenizer.encode(sequence_masked_0, add_special_tokens=False)
    #             self.assertEqual(len(encoded_masked), len(encoded_0))
    #             mask_loc = encoded_masked.index(mask_ind)
    #             encoded_masked[mask_loc] = encoded_0[mask_loc]

    #             self.assertEqual(encoded_masked, encoded_0)

    #             # Test second masked sequence
    #             encoded_1 = tokenizer.encode(sequence_1, add_special_tokens=False)
    #             encoded_masked = tokenizer.encode(sequence_masked_1, add_special_tokens=False)
    #             self.assertEqual(len(encoded_masked), len(encoded_1))
    #             mask_loc = encoded_masked.index(mask_ind)
    #             encoded_masked[mask_loc] = encoded_1[mask_loc]

    #             self.assertEqual(encoded_masked, encoded_1)
    def test_special_tokens_mask(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                sequence_0 = "Encode this."
                # Testing single inputs
                encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False)
                encoded_sequence_dict = tokenizer.encode_plus(
                    sequence_0, add_special_tokens=True, return_special_tokens_mask=True  # , add_prefix_space=False
                )
                encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
                special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
                self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))

                filtered_sequence = [x for i, x in enumerate(encoded_sequence_w_special) if not special_tokens_mask[i]]
                self.assertEqual(encoded_sequence, filtered_sequence)
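    # Minimal usage sketch of the mask checked above (a BERT-like tokenizer adds [CLS]/[SEP],
    # so the mask might look like [1, 0, 0, 0, 1]):
    #
    #     enc = tokenizer.encode_plus("Encode this.", add_special_tokens=True, return_special_tokens_mask=True)
    #     content = [tok for tok, m in zip(enc["input_ids"], enc["special_tokens_mask"]) if not m]
    #     assert content == tokenizer.encode("Encode this.", add_special_tokens=False)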
    def test_special_tokens_mask_input_pairs(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                sequence_0 = "Encode this."
                sequence_1 = "This one too please."
                encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False)
                encoded_sequence += tokenizer.encode(sequence_1, add_special_tokens=False)
                encoded_sequence_dict = tokenizer.encode_plus(
                    sequence_0,
                    sequence_1,
                    add_special_tokens=True,
                    return_special_tokens_mask=True,
                    # add_prefix_space=False,
                )
                encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
                special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
                self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))

                filtered_sequence = [
                    (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)
                ]
                filtered_sequence = [x for x in filtered_sequence if x is not None]
                self.assertEqual(encoded_sequence, filtered_sequence)
    def test_padding_side_in_kwargs(self):
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                if self.test_rust_tokenizer:
                    tokenizer_r = self.rust_tokenizer_class.from_pretrained(
                        pretrained_name, padding_side="left", **kwargs
                    )
                    self.assertEqual(tokenizer_r.padding_side, "left")

                    tokenizer_r = self.rust_tokenizer_class.from_pretrained(
                        pretrained_name, padding_side="right", **kwargs
                    )
                    self.assertEqual(tokenizer_r.padding_side, "right")

                    self.assertRaises(
                        ValueError,
                        self.rust_tokenizer_class.from_pretrained,
                        pretrained_name,
                        padding_side="unauthorized",
                        **kwargs,
                    )

                if self.test_slow_tokenizer:
                    tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, padding_side="left", **kwargs)
                    self.assertEqual(tokenizer_p.padding_side, "left")

                    tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, padding_side="right", **kwargs)
                    self.assertEqual(tokenizer_p.padding_side, "right")

                    self.assertRaises(
                        ValueError,
                        self.tokenizer_class.from_pretrained,
                        pretrained_name,
                        padding_side="unauthorized",
                        **kwargs,
                    )
    def test_truncation_side_in_kwargs(self):
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                if self.test_rust_tokenizer:
                    tokenizer_r = self.rust_tokenizer_class.from_pretrained(
                        pretrained_name, truncation_side="left", **kwargs
                    )
                    self.assertEqual(tokenizer_r.truncation_side, "left")

                    tokenizer_r = self.rust_tokenizer_class.from_pretrained(
                        pretrained_name, truncation_side="right", **kwargs
                    )
                    self.assertEqual(tokenizer_r.truncation_side, "right")

                    self.assertRaises(
                        ValueError,
                        self.rust_tokenizer_class.from_pretrained,
                        pretrained_name,
                        truncation_side="unauthorized",
                        **kwargs,
                    )

                if self.test_slow_tokenizer:
                    tokenizer_p = self.tokenizer_class.from_pretrained(
                        pretrained_name, truncation_side="left", **kwargs
                    )
                    self.assertEqual(tokenizer_p.truncation_side, "left")

                    tokenizer_p = self.tokenizer_class.from_pretrained(
                        pretrained_name, truncation_side="right", **kwargs
                    )
                    self.assertEqual(tokenizer_p.truncation_side, "right")

                    self.assertRaises(
                        ValueError,
                        self.tokenizer_class.from_pretrained,
                        pretrained_name,
                        truncation_side="unauthorized",
                        **kwargs,
                    )
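    # Both attributes can also be passed at load time; a short sketch (the checkpoint name is
    # illustrative, not part of this test suite):
    #
    #     tok = BertTokenizer.from_pretrained("bert-base-uncased", padding_side="left", truncation_side="left")
    #     assert tok.padding_side == "left" and tok.truncation_side == "left"
    #     # any value other than "left"/"right" raises a ValueError, as asserted above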
    def test_right_and_left_padding(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                sequence = "Sequence"
                padding_size = 10

                # check correct behavior if no pad_token_id exists, and add one if needed
                self._check_no_pad_token_padding(tokenizer, sequence)

                padding_idx = tokenizer.pad_token_id

                # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
                tokenizer.padding_side = "right"
                encoded_sequence = tokenizer.encode(sequence)
                sequence_length = len(encoded_sequence)
                padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, padding="max_length")
                padded_sequence_length = len(padded_sequence)
                self.assertEqual(sequence_length + padding_size, padded_sequence_length)
                self.assertEqual(encoded_sequence + [padding_idx] * padding_size, padded_sequence)

                # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
                tokenizer.padding_side = "left"
                encoded_sequence = tokenizer.encode(sequence)
                sequence_length = len(encoded_sequence)
                padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, padding="max_length")
                padded_sequence_length = len(padded_sequence)
                self.assertEqual(sequence_length + padding_size, padded_sequence_length)
                self.assertEqual([padding_idx] * padding_size + encoded_sequence, padded_sequence)

                # RIGHT & LEFT PADDING - Check that nothing is done for 'longest' and 'no_padding'
                encoded_sequence = tokenizer.encode(sequence)
                sequence_length = len(encoded_sequence)

                tokenizer.padding_side = "right"
                padded_sequence_right = tokenizer.encode(sequence, padding=True)
                padded_sequence_right_length = len(padded_sequence_right)
                self.assertEqual(sequence_length, padded_sequence_right_length)
                self.assertEqual(encoded_sequence, padded_sequence_right)

                tokenizer.padding_side = "left"
                padded_sequence_left = tokenizer.encode(sequence, padding="longest")
                padded_sequence_left_length = len(padded_sequence_left)
                self.assertEqual(sequence_length, padded_sequence_left_length)
                self.assertEqual(encoded_sequence, padded_sequence_left)

                tokenizer.padding_side = "right"
                padded_sequence_right = tokenizer.encode(sequence)
                padded_sequence_right_length = len(padded_sequence_right)
                self.assertEqual(sequence_length, padded_sequence_right_length)
                self.assertEqual(encoded_sequence, padded_sequence_right)

                tokenizer.padding_side = "left"
                padded_sequence_left = tokenizer.encode(sequence, padding=False)
                padded_sequence_left_length = len(padded_sequence_left)
                self.assertEqual(sequence_length, padded_sequence_left_length)
                self.assertEqual(encoded_sequence, padded_sequence_left)
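    # Worked sketch of the two padding sides checked above (ids and pad_token_id=0 are illustrative):
    #
    #     # encoded_sequence == [5, 6, 7], padding_size == 2
    #     # padding_side == "right" -> [5, 6, 7, 0, 0]
    #     # padding_side == "left"  -> [0, 0, 5, 6, 7]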
    def test_right_and_left_truncation(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                sequence = "This is a test sequence"

                # RIGHT TRUNCATION - Check that it correctly truncates when a maximum length is specified along with the truncation flag set to True
                truncation_size = 3
                tokenizer.truncation_side = "right"
                encoded_sequence = tokenizer.encode(sequence, add_special_tokens=False)
                sequence_length = len(encoded_sequence)
                # Remove EOS/BOS tokens
                truncated_sequence = tokenizer.encode(
                    sequence, max_length=sequence_length - truncation_size, truncation=True, add_special_tokens=False
                )
                truncated_sequence_length = len(truncated_sequence)
                self.assertEqual(sequence_length, truncated_sequence_length + truncation_size)
                self.assertEqual(encoded_sequence[:-truncation_size], truncated_sequence)

                # LEFT TRUNCATION - Check that it correctly truncates when a maximum length is specified along with the truncation flag set to True
                tokenizer.truncation_side = "left"
                sequence_length = len(encoded_sequence)
                truncated_sequence = tokenizer.encode(
                    sequence, max_length=sequence_length - truncation_size, truncation=True, add_special_tokens=False
                )
                truncated_sequence_length = len(truncated_sequence)
                self.assertEqual(sequence_length, truncated_sequence_length + truncation_size)
                self.assertEqual(encoded_sequence[truncation_size:], truncated_sequence)

                # RIGHT & LEFT TRUNCATION - Check that nothing is done for 'longest_first' and 'no_truncation'
                sequence_length = len(encoded_sequence)

                tokenizer.truncation_side = "right"
                truncated_sequence_right = tokenizer.encode(sequence, truncation=True, add_special_tokens=False)
                truncated_sequence_right_length = len(truncated_sequence_right)
                self.assertEqual(sequence_length, truncated_sequence_right_length)
                self.assertEqual(encoded_sequence, truncated_sequence_right)

                tokenizer.truncation_side = "left"
                truncated_sequence_left = tokenizer.encode(sequence, truncation="longest_first", add_special_tokens=False)
                truncated_sequence_left_length = len(truncated_sequence_left)
                self.assertEqual(sequence_length, truncated_sequence_left_length)
                self.assertEqual(encoded_sequence, truncated_sequence_left)

                tokenizer.truncation_side = "right"
                truncated_sequence_right = tokenizer.encode(sequence, add_special_tokens=False)
                truncated_sequence_right_length = len(truncated_sequence_right)
                self.assertEqual(sequence_length, truncated_sequence_right_length)
                self.assertEqual(encoded_sequence, truncated_sequence_right)

                tokenizer.truncation_side = "left"
                truncated_sequence_left = tokenizer.encode(sequence, truncation=False, add_special_tokens=False)
                truncated_sequence_left_length = len(truncated_sequence_left)
                self.assertEqual(sequence_length, truncated_sequence_left_length)
                self.assertEqual(encoded_sequence, truncated_sequence_left)
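    # Worked sketch of the two truncation sides checked above (ids are illustrative):
    #
    #     # encoded_sequence == [5, 6, 7, 8, 9], truncation_size == 3
    #     # truncation_side == "right" -> [5, 6]  (the last 3 tokens are dropped)
    #     # truncation_side == "left"  -> [8, 9]  (the first 3 tokens are dropped)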
    def test_padding_to_max_length(self):
        """We keep this test for backward compatibility but it should be removed when `pad_to_max_length` is deprecated."""
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                sequence = "Sequence"
                padding_size = 10

                # check correct behavior if no pad_token_id exists, and add one if needed
                self._check_no_pad_token_padding(tokenizer, sequence)

                padding_idx = tokenizer.pad_token_id

                # Check that it correctly pads when a maximum length is specified along with the padding flag set to True
                tokenizer.padding_side = "right"
                encoded_sequence = tokenizer.encode(sequence)
                sequence_length = len(encoded_sequence)
                # FIXME: the next line should be padding(max_length) to avoid warning
                padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True)
                padded_sequence_length = len(padded_sequence)
                self.assertEqual(sequence_length + padding_size, padded_sequence_length)
                self.assertEqual(encoded_sequence + [padding_idx] * padding_size, padded_sequence)

                # Check that nothing is done when a maximum length is not specified
                encoded_sequence = tokenizer.encode(sequence)
                sequence_length = len(encoded_sequence)

                tokenizer.padding_side = "right"
                padded_sequence_right = tokenizer.encode(sequence, pad_to_max_length=True)
                padded_sequence_right_length = len(padded_sequence_right)
                self.assertEqual(sequence_length, padded_sequence_right_length)
                self.assertEqual(encoded_sequence, padded_sequence_right)
    def test_padding_to_multiple_of(self):
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                if tokenizer.pad_token is None:
                    self.skipTest("No padding token.")
                else:
                    empty_tokens = tokenizer("", padding=True, pad_to_multiple_of=8)
                    normal_tokens = tokenizer("This is a sample input", padding=True, pad_to_multiple_of=8)
                    for key, value in empty_tokens.items():
                        self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")
                    for key, value in normal_tokens.items():
                        self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")

                    normal_tokens = tokenizer("This", pad_to_multiple_of=8)
                    for key, value in normal_tokens.items():
                        self.assertNotEqual(len(value) % 8, 0, f"BatchEncoding.{key} should not be padded to a multiple of 8")

                    # Should also work with truncation
                    normal_tokens = tokenizer("This", padding=True, truncation=True, pad_to_multiple_of=8)
                    for key, value in normal_tokens.items():
                        self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")

                    # truncation to something which is not a multiple of pad_to_multiple_of raises an error
                    self.assertRaises(
                        ValueError,
                        tokenizer.__call__,
                        "This",
                        padding=True,
                        truncation=True,
                        max_length=12,
                        pad_to_multiple_of=8,
                    )
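    # Rounding sketch for `pad_to_multiple_of` as exercised above (lengths are illustrative):
    #
    #     enc = tokenizer("This is a sample input", padding=True, pad_to_multiple_of=8)
    #     # a 5-token encoding is padded up to length 8, a 9-token one up to 16, etc.
    #     assert len(enc["input_ids"]) % 8 == 0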
    def test_padding_with_attention_mask(self):
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                if tokenizer.pad_token is None:
                    self.skipTest("No padding token.")
                if "attention_mask" not in tokenizer.model_input_names:
                    self.skipTest("This model does not use attention mask.")

                features = [
                    {"input_ids": [1, 2, 3, 4, 5, 6], "attention_mask": [1, 1, 1, 1, 1, 0]},
                    {"input_ids": [1, 2, 3], "attention_mask": [1, 1, 0]},
                ]
                padded_features = tokenizer.pad(features)
                if tokenizer.padding_side == "right":
                    self.assertListEqual(padded_features["attention_mask"], [[1, 1, 1, 1, 1, 0], [1, 1, 0, 0, 0, 0]])
                else:
                    self.assertListEqual(padded_features["attention_mask"], [[1, 1, 1, 1, 1, 0], [0, 0, 0, 1, 1, 0]])
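    # `tokenizer.pad` also accepts an explicit target length; a right-padding sketch using the
    # same pre-built feature dicts as above:
    #
    #     features = [{"input_ids": [1, 2, 3], "attention_mask": [1, 1, 0]}]
    #     padded = tokenizer.pad(features, padding="max_length", max_length=5)
    #     # padded["attention_mask"] -> [[1, 1, 0, 0, 0]]; pre-existing zeros are preserved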
    def test_encode_plus_with_padding(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                sequence = "Sequence"

                # check correct behavior if no pad_token_id exists, and add one if needed
                self._check_no_pad_token_padding(tokenizer, sequence)

                padding_size = 10
                padding_idx = tokenizer.pad_token_id
                token_type_padding_idx = tokenizer.pad_token_type_id

                encoded_sequence = tokenizer.encode_plus(sequence, return_special_tokens_mask=True)
                input_ids = encoded_sequence["input_ids"]
                special_tokens_mask = encoded_sequence["special_tokens_mask"]
                sequence_length = len(input_ids)

                # Test 'longest' and 'no_padding' don't do anything
                tokenizer.padding_side = "right"

                not_padded_sequence = tokenizer.encode_plus(
                    sequence,
                    padding=True,
                    return_special_tokens_mask=True,
                )
                not_padded_input_ids = not_padded_sequence["input_ids"]
                not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"]
                not_padded_sequence_length = len(not_padded_input_ids)

                self.assertEqual(sequence_length, not_padded_sequence_length)
                self.assertEqual(input_ids, not_padded_input_ids)
                self.assertEqual(special_tokens_mask, not_padded_special_tokens_mask)

                not_padded_sequence = tokenizer.encode_plus(
                    sequence,
                    padding=False,
                    return_special_tokens_mask=True,
                )
                not_padded_input_ids = not_padded_sequence["input_ids"]
                not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"]
                not_padded_sequence_length = len(not_padded_input_ids)

                self.assertEqual(sequence_length, not_padded_sequence_length)
                self.assertEqual(input_ids, not_padded_input_ids)
                self.assertEqual(special_tokens_mask, not_padded_special_tokens_mask)

                # Test right padding
                tokenizer.padding_side = "right"
                right_padded_sequence = tokenizer.encode_plus(
                    sequence,
                    max_length=sequence_length + padding_size,
                    padding="max_length",
                    return_special_tokens_mask=True,
                )
                right_padded_input_ids = right_padded_sequence["input_ids"]
                right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
                right_padded_sequence_length = len(right_padded_input_ids)

                self.assertEqual(sequence_length + padding_size, right_padded_sequence_length)
                self.assertEqual(input_ids + [padding_idx] * padding_size, right_padded_input_ids)
                self.assertEqual(special_tokens_mask + [1] * padding_size, right_padded_special_tokens_mask)

                # Test left padding
                tokenizer.padding_side = "left"
                left_padded_sequence = tokenizer.encode_plus(
                    sequence,
                    max_length=sequence_length + padding_size,
                    padding="max_length",
                    return_special_tokens_mask=True,
                )
                left_padded_input_ids = left_padded_sequence["input_ids"]
                left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
                left_padded_sequence_length = len(left_padded_input_ids)

                self.assertEqual(sequence_length + padding_size, left_padded_sequence_length)
                self.assertEqual([padding_idx] * padding_size + input_ids, left_padded_input_ids)
                self.assertEqual([1] * padding_size + special_tokens_mask, left_padded_special_tokens_mask)

                if "token_type_ids" in tokenizer.model_input_names:
                    token_type_ids = encoded_sequence["token_type_ids"]
                    left_padded_token_type_ids = left_padded_sequence["token_type_ids"]
                    right_padded_token_type_ids = right_padded_sequence["token_type_ids"]

                    self.assertEqual(token_type_ids + [token_type_padding_idx] * padding_size, right_padded_token_type_ids)
                    self.assertEqual([token_type_padding_idx] * padding_size + token_type_ids, left_padded_token_type_ids)

                if "attention_mask" in tokenizer.model_input_names:
                    attention_mask = encoded_sequence["attention_mask"]
                    right_padded_attention_mask = right_padded_sequence["attention_mask"]
                    left_padded_attention_mask = left_padded_sequence["attention_mask"]

                    self.assertEqual(attention_mask + [0] * padding_size, right_padded_attention_mask)
                    self.assertEqual([0] * padding_size + attention_mask, left_padded_attention_mask)
    def test_separate_tokenizers(self):
        # This tests that tokenizers don't impact others. Unfortunately the case where it fails is when
        # we're loading an S3 configuration from a pre-trained identifier, and we have no way of testing those today.
        tokenizers = self.get_tokenizers(random_argument=True)
        new_tokenizers = self.get_tokenizers(random_argument=False)

        for tokenizer, new_tokenizer in zip(tokenizers, new_tokenizers):
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                self.assertTrue(tokenizer.init_kwargs["random_argument"])
                self.assertFalse(new_tokenizer.init_kwargs["random_argument"])
    def test_get_vocab(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                vocab_dict = tokenizer.get_vocab()
                self.assertIsInstance(vocab_dict, dict)
                self.assertGreaterEqual(len(tokenizer), len(vocab_dict))

                vocab = [tokenizer.convert_ids_to_tokens(i) for i in range(len(tokenizer))]
                self.assertEqual(len(vocab), len(tokenizer))

                tokenizer.add_tokens(["asdfasdfasdfasdf"])
                vocab = [tokenizer.convert_ids_to_tokens(i) for i in range(len(tokenizer))]
                self.assertEqual(len(vocab), len(tokenizer))
    def test_conversion_reversible(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                vocab = tokenizer.get_vocab()
                for word, ind in vocab.items():
                    if word == tokenizer.unk_token:
                        continue
                    self.assertEqual(tokenizer.convert_tokens_to_ids(word), ind)
                    self.assertEqual(tokenizer.convert_ids_to_tokens(ind), word)

    def test_call(self):
        # Tests that calling the tokenizer wraps encode_plus and batch_encode_plus
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                sequences = [
                    "Testing batch encode plus",
                    "Testing batch encode plus with different sequence lengths",
                    "Testing batch encode plus with different sequence lengths correctly pads",
                ]

                # Test not batched
                encoded_sequences_1 = tokenizer.encode_plus(sequences[0])
                encoded_sequences_2 = tokenizer(sequences[0])
                self.assertEqual(encoded_sequences_1, encoded_sequences_2)

                # Test not batched pairs
                encoded_sequences_1 = tokenizer.encode_plus(sequences[0], sequences[1])
                encoded_sequences_2 = tokenizer(sequences[0], sequences[1])
                self.assertEqual(encoded_sequences_1, encoded_sequences_2)

                # Test batched
                encoded_sequences_1 = tokenizer.batch_encode_plus(sequences)
                encoded_sequences_2 = tokenizer(sequences)
                self.assertEqual(encoded_sequences_1, encoded_sequences_2)

                # Test batched pairs
                encoded_sequences_1 = tokenizer.batch_encode_plus(list(zip(sequences, sequences)))
                encoded_sequences_2 = tokenizer(sequences, sequences)
                self.assertEqual(encoded_sequences_1, encoded_sequences_2)
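    # Equivalence sketch for the call styles exercised above:
    #
    #     text = "Testing batch encode plus"
    #     assert tokenizer(text) == tokenizer.encode_plus(text)
    #     assert tokenizer([text, text]) == tokenizer.batch_encode_plus([text, text])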
    def test_batch_encode_plus_batch_sequence_length(self):
        # Tests that all encoded values have the correct size
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                sequences = [
                    "Testing batch encode plus",
                    "Testing batch encode plus with different sequence lengths",
                    "Testing batch encode plus with different sequence lengths correctly pads",
                ]

                encoded_sequences = [tokenizer.encode_plus(sequence) for sequence in sequences]
                encoded_sequences_batch = tokenizer.batch_encode_plus(sequences, padding=False)
                self.assertListEqual(
                    encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch)
                )

                maximum_length = len(max([encoded_sequence["input_ids"] for encoded_sequence in encoded_sequences], key=len))

                # check correct behavior if no pad_token_id exists, and add one if needed
                self._check_no_pad_token_padding(tokenizer, sequences)

                encoded_sequences_padded = [
                    tokenizer.encode_plus(sequence, max_length=maximum_length, padding="max_length")
                    for sequence in sequences
                ]

                encoded_sequences_batch_padded = tokenizer.batch_encode_plus(sequences, padding=True)
                self.assertListEqual(
                    encoded_sequences_padded,
                    self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch_padded),
                )

                # check 'longest' is insensitive to a max length
                encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus(sequences, padding=True)
                encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus(
                    sequences, max_length=maximum_length + 10, padding="longest"
                )
                for key in encoded_sequences_batch_padded_1.keys():
                    self.assertListEqual(
                        encoded_sequences_batch_padded_1[key],
                        encoded_sequences_batch_padded_2[key],
                    )

                # check 'no_padding' is insensitive to a max length
                encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus(sequences, padding=False)
                encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus(
                    sequences, max_length=maximum_length + 10, padding=False
                )
                for key in encoded_sequences_batch_padded_1.keys():
                    self.assertListEqual(
                        encoded_sequences_batch_padded_1[key],
                        encoded_sequences_batch_padded_2[key],
                    )
    @require_tokenizers
    def test_added_token_are_matched_longest_first(self):
        if not self.test_slow_tokenizer:
            self.skipTest("This test is only for slow tokenizers")
            return
        tokenizers = self.get_tokenizers(fast=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                try:
                    tokenizer.add_tokens([AddedToken("extra_id_1")])
                    tokenizer.add_tokens([AddedToken("extra_id_100")])
                except Exception:
                    # Canine cannot add tokens which are not codepoints
                    self.skipTest("Cannot add those Added tokens")

                # XXX: This used to split on `extra_id_1` first; we're matching
                # longest first now.
                tokens = tokenizer.tokenize("This is some extra_id_100")
                self.assertIn("extra_id_100", tokens)

        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                tokenizer.add_tokens([AddedToken("extra_id_100")])
                tokenizer.add_tokens([AddedToken("extra_id_1")])

                tokens = tokenizer.tokenize("This is some extra_id_100")
                self.assertIn("extra_id_100", tokens)
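    # Matching sketch: regardless of insertion order, added tokens are matched longest-first,
    # so "extra_id_100" is never split into "extra_id_1" + "00":
    #
    #     tokenizer.add_tokens([AddedToken("extra_id_1"), AddedToken("extra_id_100")])
    #     assert "extra_id_100" in tokenizer.tokenize("This is some extra_id_100")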
    @require_tokenizers
    def test_added_token_serializable(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                new_token = AddedToken("new_token", lstrip=True)
                tokenizer.add_special_tokens({"additional_special_tokens": [new_token]})

                with tempfile.TemporaryDirectory() as tmp_dir_name:
                    tokenizer.save_pretrained(tmp_dir_name)
                    tokenizer.from_pretrained(tmp_dir_name)
    def test_batch_encode_plus_padding(self):
        # Test that padded sequences are equivalent between batch_encode_plus and encode_plus

        # Right padding tests
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                sequences = [
                    "Testing batch encode plus",
                    "Testing batch encode plus with different sequence lengths",
                    "Testing batch encode plus with different sequence lengths correctly pads",
                ]

                max_length = 100

                # check correct behavior if no pad_token_id exists, and add one if needed
                self._check_no_pad_token_padding(tokenizer, sequences)

                encoded_sequences = [
                    tokenizer.encode_plus(sequence, max_length=max_length, padding="max_length")
                    for sequence in sequences
                ]
                encoded_sequences_batch = tokenizer.batch_encode_plus(sequences, max_length=max_length, padding="max_length")
                self.assertListEqual(
                    encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch)
                )

        # Left padding tests
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                tokenizer.padding_side = "left"
                sequences = [
                    "Testing batch encode plus",
                    "Testing batch encode plus with different sequence lengths",
                    "Testing batch encode plus with different sequence lengths correctly pads",
                ]

                max_length = 100

                # check correct behavior if no pad_token_id exists, and add one if needed
                self._check_no_pad_token_padding(tokenizer, sequences)

                encoded_sequences = [
                    tokenizer.encode_plus(sequence, max_length=max_length, padding="max_length")
                    for sequence in sequences
                ]
                encoded_sequences_batch = tokenizer.batch_encode_plus(sequences, max_length=max_length, padding="max_length")
                self.assertListEqual(
                    encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch)
                )
    def test_pretokenized_inputs(self):
        # Test when inputs are pretokenized
        tokenizers = self.get_tokenizers(do_lower_case=False)  # , add_prefix_space=True)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                if hasattr(tokenizer, "add_prefix_space") and not tokenizer.add_prefix_space:
                    continue

                # Prepare a sequence from our tokenizer vocabulary
                sequence, ids = self.get_clean_sequence(tokenizer, with_prefix_space=True, max_length=20)
                # sequence = " " + sequence  # To be sure the byte-level tokenizers are feeling good
                token_sequence = sequence.split()
                # sequence_no_prefix_space = sequence.strip()

                # Test encode for pretokenized inputs
                output = tokenizer.encode(token_sequence, is_split_into_words=True, add_special_tokens=False)
                output_sequence = tokenizer.encode(sequence, add_special_tokens=False)
                self.assertEqual(output, output_sequence)

                output = tokenizer.encode(token_sequence, is_split_into_words=True, add_special_tokens=True)
                output_sequence = tokenizer.encode(sequence, add_special_tokens=True)
                self.assertEqual(output, output_sequence)

                # Test encode_plus for pretokenized inputs
                output = tokenizer.encode_plus(token_sequence, is_split_into_words=True, add_special_tokens=False)
                output_sequence = tokenizer.encode_plus(sequence, add_special_tokens=False)
                for key in output.keys():
                    self.assertEqual(output[key], output_sequence[key])
                output = tokenizer.encode_plus(token_sequence, is_split_into_words=True, add_special_tokens=True)
                output_sequence = tokenizer.encode_plus(sequence, add_special_tokens=True)
                for key in output.keys():
                    self.assertEqual(output[key], output_sequence[key])

                # Test batch_encode_plus for pretokenized inputs
                sequence_batch = [sequence.strip()] * 2 + [sequence.strip() + " " + sequence.strip()]
                token_sequence_batch = [s.split() for s in sequence_batch]
                sequence_batch_cleaned_up_spaces = [" " + " ".join(s) for s in token_sequence_batch]

                output = tokenizer.batch_encode_plus(token_sequence_batch, is_split_into_words=True, add_special_tokens=False)
                output_sequence = tokenizer.batch_encode_plus(sequence_batch_cleaned_up_spaces, add_special_tokens=False)
                for key in output.keys():
                    self.assertEqual(output[key], output_sequence[key])
                output = tokenizer.batch_encode_plus(token_sequence_batch, is_split_into_words=True, add_special_tokens=True)
                output_sequence = tokenizer.batch_encode_plus(sequence_batch_cleaned_up_spaces, add_special_tokens=True)
                for key in output.keys():
                    self.assertEqual(output[key], output_sequence[key])

                # Test encode for pretokenized inputs pairs
                output = tokenizer.encode(token_sequence, token_sequence, is_split_into_words=True, add_special_tokens=False)
                output_sequence = tokenizer.encode(sequence, sequence, add_special_tokens=False)
                self.assertEqual(output, output_sequence)
                output = tokenizer.encode(token_sequence, token_sequence, is_split_into_words=True, add_special_tokens=True)
                output_sequence = tokenizer.encode(sequence, sequence, add_special_tokens=True)
                self.assertEqual(output, output_sequence)

                # Test encode_plus for pretokenized inputs pairs
                output = tokenizer.encode_plus(token_sequence, token_sequence, is_split_into_words=True, add_special_tokens=False)
                output_sequence = tokenizer.encode_plus(sequence, sequence, add_special_tokens=False)
                for key in output.keys():
                    self.assertEqual(output[key], output_sequence[key])
                output = tokenizer.encode_plus(token_sequence, token_sequence, is_split_into_words=True, add_special_tokens=True)
                output_sequence = tokenizer.encode_plus(sequence, sequence, add_special_tokens=True)
                for key in output.keys():
                    self.assertEqual(output[key], output_sequence[key])

                # Test batch_encode_plus for pretokenized inputs pairs
                sequence_pair_batch = [(sequence.strip(), sequence.strip())] * 2 + [
                    (sequence.strip() + " " + sequence.strip(), sequence.strip())
                ]
                token_sequence_pair_batch = [tuple(s.split() for s in pair) for pair in sequence_pair_batch]
                sequence_pair_batch_cleaned_up_spaces = [
                    tuple(" " + " ".join(s) for s in pair) for pair in token_sequence_pair_batch
                ]

                output = tokenizer.batch_encode_plus(token_sequence_pair_batch, is_split_into_words=True, add_special_tokens=False)
                output_sequence = tokenizer.batch_encode_plus(sequence_pair_batch_cleaned_up_spaces, add_special_tokens=False)
                for key in output.keys():
                    self.assertEqual(output[key], output_sequence[key])
                output = tokenizer.batch_encode_plus(token_sequence_pair_batch, is_split_into_words=True, add_special_tokens=True)
                output_sequence = tokenizer.batch_encode_plus(sequence_pair_batch_cleaned_up_spaces, add_special_tokens=True)
                for key in output.keys():
                    self.assertEqual(output[key], output_sequence[key])
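    # Pretokenized-input sketch mirroring the equivalences above (for the tokenizers this test
    # covers, both encodings match; note the prefix space on the raw-text side):
    #
    #     words = "Testing pretokenized inputs".split()
    #     from_words = tokenizer.encode(words, is_split_into_words=True, add_special_tokens=False)
    #     from_text = tokenizer.encode(" Testing pretokenized inputs", add_special_tokens=False)
    #     assert from_words == from_text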
    def test_prepare_for_model(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                string_sequence = "Testing the prepare_for_model method."
                ids = tokenizer.encode(string_sequence, add_special_tokens=False)
                prepared_input_dict = tokenizer.prepare_for_model(ids, add_special_tokens=True)

                input_dict = tokenizer.encode_plus(string_sequence, add_special_tokens=True)

                self.assertEqual(input_dict, prepared_input_dict)
    def test_batch_encode_plus_overflowing_tokens(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            string_sequences = ["Testing the prepare_for_model method.", "Test"]

            if tokenizer.pad_token is None:
                tokenizer.add_special_tokens({"pad_token": "[PAD]"})

            tokenizer.batch_encode_plus(
                string_sequences, return_overflowing_tokens=True, truncation=True, padding=True, max_length=3
            )
    @is_pt_tf_cross_test
    def test_batch_encode_plus_tensors(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                sequences = [
                    "Testing batch encode plus",
                    "Testing batch encode plus with different sequence lengths",
                    "Testing batch encode plus with different sequence lengths correctly pads",
                ]

                # A Tensor cannot be built from sequences which are not the same size
                self.assertRaises(ValueError, tokenizer.batch_encode_plus, sequences, return_tensors="pt")
                self.assertRaises(ValueError, tokenizer.batch_encode_plus, sequences, return_tensors="tf")

                if tokenizer.pad_token_id is None:
                    self.assertRaises(
                        ValueError,
                        tokenizer.batch_encode_plus,
                        sequences,
                        padding=True,
                        return_tensors="pt",
                    )
                    self.assertRaises(
                        ValueError,
                        tokenizer.batch_encode_plus,
                        sequences,
                        padding="longest",
                        return_tensors="tf",
                    )
                else:
                    pytorch_tensor = tokenizer.batch_encode_plus(sequences, padding=True, return_tensors="pt")
                    tensorflow_tensor = tokenizer.batch_encode_plus(sequences, padding="longest", return_tensors="tf")
                    encoded_sequences = tokenizer.batch_encode_plus(sequences, padding=True)

                    for key in encoded_sequences.keys():
                        pytorch_value = pytorch_tensor[key].tolist()
                        tensorflow_value = tensorflow_tensor[key].numpy().tolist()
                        encoded_value = encoded_sequences[key]

                        self.assertEqual(pytorch_value, tensorflow_value, encoded_value)
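    # Tensor-return sketch for the cross-framework equality checked above (requires a pad token):
    #
    #     pt_batch = tokenizer.batch_encode_plus(sequences, padding=True, return_tensors="pt")
    #     tf_batch = tokenizer.batch_encode_plus(sequences, padding="longest", return_tensors="tf")
    #     assert pt_batch["input_ids"].tolist() == tf_batch["input_ids"].numpy().tolist()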
    def _check_no_pad_token_padding(self, tokenizer, sequences):
        # if tokenizer does not have pad_token_id, an error should be thrown
        if tokenizer.pad_token_id is None:
            with self.assertRaises(ValueError):
                if isinstance(sequences, list):
                    tokenizer.batch_encode_plus(sequences, padding="longest")
                else:
                    tokenizer.encode_plus(sequences, padding=True)

            # add pad_token_id to pass subsequent tests
            tokenizer.add_special_tokens({"pad_token": "<PAD>"})
    def check_subword_sampling(
        self,
        tokenizer: PreTrainedTokenizer,
        text: str = None,
    ) -> None:
        """
        Check if the tokenizer generates different results when subword regularization is enabled.

        Subword regularization augments training data with subword sampling.
        This has a random component.

        Args:
            tokenizer: The tokenizer to check.
            text: The text to use for the checks.
        """
        text = "This is a test for subword regularization." if text is None else text
        if self.test_sentencepiece_ignore_case:
            text = text.lower()

        tokens_list = []
        for _ in range(5):
            tokens_list.append(tokenizer.tokenize(text))

        # the list of different pairs of tokens_list
        combinations = itertools.combinations(tokens_list, 2)

        # check if sampling is done
        subword_sampling_found = False
        for combination in combinations:
            if combination[0] != combination[1]:
                subword_sampling_found = True
        self.assertTrue(subword_sampling_found)

        # check if converting back to the original text works
        for tokens in tokens_list:
            if self.test_sentencepiece_ignore_case:
                self.assertEqual(text, tokenizer.convert_tokens_to_string(tokens).lower())
            else:
                self.assertEqual(text, tokenizer.convert_tokens_to_string(tokens))
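    # Illustrative caller sketch (assumes a sentencepiece-backed test tokenizer; the
    # `sp_model_kwargs` below are sentencepiece's own sampling options):
    #
    #     tokenizer = self.get_tokenizer(sp_model_kwargs={"enable_sampling": True, "alpha": 0.1, "nbest_size": -1})
    #     self.check_subword_sampling(tokenizer)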
    @require_torch
    @slow
    def test_torch_encode_plus_sent_to_model(self):
        import torch

        from transformers import MODEL_MAPPING, TOKENIZER_MAPPING

        MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings(MODEL_MAPPING, TOKENIZER_MAPPING)

        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):

                if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING:
                    return

                config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__]
                config = config_class()

                if config.is_encoder_decoder or config.pad_token_id is None:
                    return

                model = model_class(config)

                # Make sure the model contains at least the full vocabulary size in its embedding matrix
                is_using_common_embeddings = hasattr(model.get_input_embeddings(), "weight")
                if is_using_common_embeddings:
                    self.assertGreaterEqual(model.get_input_embeddings().weight.shape[0], len(tokenizer))

                # Build sequence
                first_ten_tokens = list(tokenizer.get_vocab().keys())[:10]
                sequence = " ".join(first_ten_tokens)
                encoded_sequence = tokenizer.encode_plus(sequence, return_tensors="pt")

                # Ensure that the BatchEncoding.to() method works.
                encoded_sequence.to(model.device)

                batch_encoded_sequence = tokenizer.batch_encode_plus([sequence, sequence], return_tensors="pt")

                # This should not fail
                with torch.no_grad():  # saves some time
                    model(**encoded_sequence)
                    model(**batch_encoded_sequence)

                # if self.test_rust_tokenizer:
                #     fast_tokenizer = self.get_rust_tokenizer()
                #     encoded_sequence_fast = fast_tokenizer.encode_plus(sequence, return_tensors="pt")
                #     batch_encoded_sequence_fast = fast_tokenizer.batch_encode_plus([sequence, sequence], return_tensors="pt")
                #     # This should not fail
                #     model(**encoded_sequence_fast)
                #     model(**batch_encoded_sequence_fast)
    @require_tf
    @slow
    def test_tf_encode_plus_sent_to_model(self):
        from transformers import TF_MODEL_MAPPING, TOKENIZER_MAPPING

        MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings(TF_MODEL_MAPPING, TOKENIZER_MAPPING)

        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING:
                    return

                config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__]
                config = config_class()

                if config.is_encoder_decoder or config.pad_token_id is None:
                    return

                model = model_class(config)

                # Make sure the model contains at least the full vocabulary size in its embedding matrix
                self.assertGreaterEqual(model.config.vocab_size, len(tokenizer))

                # Build sequence
                first_ten_tokens = list(tokenizer.get_vocab().keys())[:10]
                sequence = " ".join(first_ten_tokens)
                encoded_sequence = tokenizer.encode_plus(sequence, return_tensors="tf")
                batch_encoded_sequence = tokenizer.batch_encode_plus([sequence, sequence], return_tensors="tf")

                # This should not fail
                model(encoded_sequence)
                model(batch_encoded_sequence)
# TODO: Check if require_torch is the best to test for numpy here ... Maybe move to require_flax when available
@require_torch
2020-09-24 16:54:37 +08:00
@slow
2020-06-04 12:57:01 +08:00
def test_np_encode_plus_sent_to_model ( self ) :
from transformers import MODEL_MAPPING , TOKENIZER_MAPPING
MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings ( MODEL_MAPPING , TOKENIZER_MAPPING )
2021-06-14 17:58:44 +08:00
tokenizers = self . get_tokenizers ( )
for tokenizer in tokenizers :
with self . subTest ( f " { tokenizer . __class__ . __name__ } " ) :
if tokenizer . __class__ not in MODEL_TOKENIZER_MAPPING :
return
2020-06-04 12:57:01 +08:00
2021-06-14 17:58:44 +08:00
config_class , model_class = MODEL_TOKENIZER_MAPPING [ tokenizer . __class__ ]
config = config_class ( )
2020-06-04 12:57:01 +08:00
2021-06-14 17:58:44 +08:00
if config . is_encoder_decoder or config . pad_token_id is None :
return
2020-06-04 12:57:01 +08:00
2021-06-14 17:58:44 +08:00
# Build sequence
first_ten_tokens = list ( tokenizer . get_vocab ( ) . keys ( ) ) [ : 10 ]
sequence = " " . join ( first_ten_tokens )
encoded_sequence = tokenizer . encode_plus ( sequence , return_tensors = " np " )
batch_encoded_sequence = tokenizer . batch_encode_plus ( [ sequence , sequence ] , return_tensors = " np " )
# TODO: add forward through JAX/Flax when PR is merged
# This is currently here to make flake8 happy !
if encoded_sequence is None :
raise ValueError ( " Cannot convert list to numpy tensor on encode_plus() " )
if batch_encoded_sequence is None :
raise ValueError ( " Cannot convert list to numpy tensor on batch_encode_plus() " )
if self . test_rust_tokenizer :
fast_tokenizer = self . get_rust_tokenizer ( )
encoded_sequence_fast = fast_tokenizer . encode_plus ( sequence , return_tensors = " np " )
batch_encoded_sequence_fast = fast_tokenizer . batch_encode_plus (
[ sequence , sequence ] , return_tensors = " np "
)
2020-06-04 12:57:01 +08:00
2021-06-14 17:58:44 +08:00
# TODO: add forward through JAX/Flax when PR is merged
# This is currently here to make flake8 happy !
if encoded_sequence_fast is None :
raise ValueError ( " Cannot convert list to numpy tensor on encode_plus() (fast) " )
2020-06-04 12:57:01 +08:00
2021-06-14 17:58:44 +08:00
if batch_encoded_sequence_fast is None :
raise ValueError ( " Cannot convert list to numpy tensor on batch_encode_plus() (fast) " )

    @require_torch
    def test_prepare_seq2seq_batch(self):
        if not self.test_seq2seq:
            return

        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                # Longer text that will definitely require truncation.
                src_text = [
                    "UN Chief Says There Is No Military Solution in Syria",
                    "Secretary-General Ban Ki-moon says his response to Russia's stepped up military support for Syria is that 'there is no military solution' to the nearly five-year conflict and more weapons will only worsen the violence and misery for millions of people.",
                ]
                tgt_text = [
                    "Şeful ONU declară că nu există o soluţie militară în Siria",
                    "Secretarul General Ban Ki-moon declară că răspunsul său la intensificarea sprijinului militar al Rusiei"
                    ' pentru Siria este că "nu există o soluţie militară" la conflictul de aproape cinci ani şi că noi arme nu'
                    " vor face decât să înrăutăţească violenţele şi mizeria pentru milioane de oameni.",
                ]
                try:
                    batch = tokenizer.prepare_seq2seq_batch(
                        src_texts=src_text,
                        tgt_texts=tgt_text,
                        max_length=3,
                        max_target_length=10,
                        return_tensors="pt",
                        src_lang="en_XX",  # this should be ignored (for all but mbart) but not cause an error
                    )
                except NotImplementedError:
                    return
                self.assertEqual(batch.input_ids.shape[1], 3)
                self.assertEqual(batch.labels.shape[1], 10)

                # max_target_length will default to max_length if not specified
                batch = tokenizer.prepare_seq2seq_batch(
                    src_text, tgt_texts=tgt_text, max_length=3, return_tensors="pt"
                )
                self.assertEqual(batch.input_ids.shape[1], 3)
                self.assertEqual(batch.labels.shape[1], 3)

                batch_encoder_only = tokenizer.prepare_seq2seq_batch(
                    src_texts=src_text, max_length=3, max_target_length=10, return_tensors="pt"
                )
                self.assertEqual(batch_encoder_only.input_ids.shape[1], 3)
                self.assertEqual(batch_encoder_only.attention_mask.shape[1], 3)
                self.assertNotIn("decoder_input_ids", batch_encoder_only)
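                # For reference: `labels` is built from `tgt_texts` truncated to `max_target_length`,
                # while `input_ids`/`attention_mask` come from `src_texts` truncated to `max_length`;
                # without `tgt_texts`, no decoder-side keys are produced, as asserted above.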

    def test_is_fast(self):
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                # Check is_fast is set correctly
                self.assertTrue(tokenizer_r.is_fast)

                if self.test_slow_tokenizer:
                    tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                    self.assertFalse(tokenizer_p.is_fast)

    def test_fast_only_inputs(self):
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

                # Ensure None raises an error
                self.assertRaises(TypeError, tokenizer_r.tokenize, None)
                self.assertRaises(TypeError, tokenizer_r.encode, None)
                self.assertRaises(TypeError, tokenizer_r.encode_plus, None)
                self.assertRaises(TypeError, tokenizer_r.batch_encode_plus, None)

    def test_alignement_methods(self):
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

                words = ["Wonderful", "no", "inspiration", "example", "with", "subtoken"]
                text = " ".join(words)
                batch_size = 3

                encoding = tokenizer_r.encode_plus(text, add_special_tokens=False)

                batch_encoding = tokenizer_r.batch_encode_plus([text] * batch_size, add_special_tokens=False)
                num_tokens = len(encoding["input_ids"])

                last_word_index = len(words) - 1
                last_token_index = num_tokens - 1
                last_batch_index = batch_size - 1
                last_char_index = len(text) - 1

                # words, tokens
                self.assertEqual(len(encoding.words(0)), num_tokens)
                self.assertEqual(max(encoding.words(0)), last_word_index)
                self.assertEqual(min(encoding.words(0)), 0)
                self.assertEqual(len(batch_encoding.words(last_batch_index)), num_tokens)
                self.assertEqual(max(batch_encoding.words(last_batch_index)), last_word_index)
                self.assertEqual(min(batch_encoding.words(last_batch_index)), 0)
                self.assertEqual(len(encoding.tokens(0)), num_tokens)

                # Assert token_to_word
                self.assertEqual(encoding.token_to_word(0), 0)
                self.assertEqual(encoding.token_to_word(0, 0), 0)
                self.assertEqual(encoding.token_to_word(last_token_index), last_word_index)
                self.assertEqual(encoding.token_to_word(0, last_token_index), last_word_index)
                self.assertEqual(batch_encoding.token_to_word(1, 0), 0)
                self.assertEqual(batch_encoding.token_to_word(0, last_token_index), last_word_index)
                self.assertEqual(batch_encoding.token_to_word(last_batch_index, last_token_index), last_word_index)

                # Assert word_to_tokens
                self.assertEqual(encoding.word_to_tokens(0).start, 0)
                self.assertEqual(encoding.word_to_tokens(0, 0).start, 0)
                self.assertEqual(encoding.word_to_tokens(last_word_index).end, last_token_index + 1)
                self.assertEqual(encoding.word_to_tokens(0, last_word_index).end, last_token_index + 1)
                self.assertEqual(batch_encoding.word_to_tokens(1, 0).start, 0)
                self.assertEqual(batch_encoding.word_to_tokens(0, last_word_index).end, last_token_index + 1)
                self.assertEqual(
                    batch_encoding.word_to_tokens(last_batch_index, last_word_index).end, last_token_index + 1
                )

                # Assert token_to_chars
                self.assertEqual(encoding.token_to_chars(0).start, 0)
                self.assertEqual(encoding.token_to_chars(0, 0).start, 0)
                self.assertEqual(encoding.token_to_chars(last_token_index).end, last_char_index + 1)
                self.assertEqual(encoding.token_to_chars(0, last_token_index).end, last_char_index + 1)
                self.assertEqual(batch_encoding.token_to_chars(1, 0).start, 0)
                self.assertEqual(batch_encoding.token_to_chars(0, last_token_index).end, last_char_index + 1)
                self.assertEqual(
                    batch_encoding.token_to_chars(last_batch_index, last_token_index).end, last_char_index + 1
                )

                # Assert char_to_token
                self.assertEqual(encoding.char_to_token(0), 0)
                self.assertEqual(encoding.char_to_token(0, 0), 0)
                self.assertEqual(encoding.char_to_token(last_char_index), last_token_index)
                self.assertEqual(encoding.char_to_token(0, last_char_index), last_token_index)
                self.assertEqual(batch_encoding.char_to_token(1, 0), 0)
                self.assertEqual(batch_encoding.char_to_token(0, last_char_index), last_token_index)
                self.assertEqual(batch_encoding.char_to_token(last_batch_index, last_char_index), last_token_index)

                # Assert char_to_word
                self.assertEqual(encoding.char_to_word(0), 0)
                self.assertEqual(encoding.char_to_word(0, 0), 0)
                self.assertEqual(encoding.char_to_word(last_char_index), last_word_index)
                self.assertEqual(encoding.char_to_word(0, last_char_index), last_word_index)
                self.assertEqual(batch_encoding.char_to_word(1, 0), 0)
                self.assertEqual(batch_encoding.char_to_word(0, last_char_index), last_word_index)
                self.assertEqual(batch_encoding.char_to_word(last_batch_index, last_char_index), last_word_index)

                # Assert word_to_chars
                self.assertEqual(encoding.word_to_chars(0).start, 0)
                self.assertEqual(encoding.word_to_chars(0, 0).start, 0)
                self.assertEqual(encoding.word_to_chars(last_word_index).end, last_char_index + 1)
                self.assertEqual(encoding.word_to_chars(0, last_word_index).end, last_char_index + 1)
                self.assertEqual(batch_encoding.word_to_chars(1, 0).start, 0)
                self.assertEqual(batch_encoding.word_to_chars(0, last_word_index).end, last_char_index + 1)
                self.assertEqual(
                    batch_encoding.word_to_chars(last_batch_index, last_word_index).end, last_char_index + 1
                )

                # Assert token_to_sequence
                self.assertEqual(encoding.token_to_sequence(num_tokens // 2), 0)
                self.assertEqual(encoding.token_to_sequence(0, num_tokens // 2), 0)
                self.assertEqual(batch_encoding.token_to_sequence(1, num_tokens // 2), 0)
                self.assertEqual(batch_encoding.token_to_sequence(0, num_tokens // 2), 0)
                self.assertEqual(batch_encoding.token_to_sequence(last_batch_index, num_tokens // 2), 0)

                # Pair of input sequences
                words = ["Wonderful", "no", "inspiration", "example", "with", "subtoken"]
                text = " ".join(words)
                pair_words = ["Amazing", "example", "full", "of", "inspiration"]
                pair_text = " ".join(pair_words)
                batch_size = 3
                index_word_in_first_seq = words.index("inspiration")
                index_word_in_pair_seq = pair_words.index("inspiration")
                index_char_in_first_seq = text.find("inspiration")
                index_char_in_pair_seq = pair_text.find("inspiration")

                pair_encoding = tokenizer_r.encode_plus(text, pair_text, add_special_tokens=False)

                pair_batch_encoding = tokenizer_r.batch_encode_plus(
                    [(text, pair_text)] * batch_size, add_special_tokens=False
                )
                num_tokens = len(encoding["input_ids"])

                last_word_index = len(words) - 1
                last_token_index = num_tokens - 1
                last_batch_index = batch_size - 1
                last_char_index = len(text) - 1

                # Assert word_to_tokens
                self.assertNotEqual(
                    pair_encoding.word_to_tokens(index_word_in_first_seq, sequence_index=0).start,
                    pair_encoding.word_to_tokens(index_word_in_pair_seq, sequence_index=1).start,
                )
                self.assertEqual(
                    pair_encoding["input_ids"][
                        pair_encoding.word_to_tokens(index_word_in_first_seq, sequence_index=0).start
                    ],
                    pair_encoding["input_ids"][
                        pair_encoding.word_to_tokens(index_word_in_pair_seq, sequence_index=1).start
                    ],
                )
                self.assertNotEqual(
                    pair_batch_encoding.word_to_tokens(1, index_word_in_first_seq, sequence_index=0).start,
                    pair_batch_encoding.word_to_tokens(1, index_word_in_pair_seq, sequence_index=1).start,
                )
                self.assertEqual(
                    pair_batch_encoding["input_ids"][1][
                        pair_batch_encoding.word_to_tokens(1, index_word_in_first_seq, sequence_index=0).start
                    ],
                    pair_batch_encoding["input_ids"][1][
                        pair_batch_encoding.word_to_tokens(1, index_word_in_pair_seq, sequence_index=1).start
                    ],
                )

                # Assert char_to_token
                self.assertNotEqual(
                    pair_encoding.char_to_token(index_char_in_first_seq, sequence_index=0),
                    pair_encoding.char_to_token(index_char_in_pair_seq, sequence_index=1),
                )
                self.assertEqual(
                    pair_encoding["input_ids"][pair_encoding.char_to_token(index_char_in_first_seq, sequence_index=0)],
                    pair_encoding["input_ids"][pair_encoding.char_to_token(index_char_in_pair_seq, sequence_index=1)],
                )
                self.assertNotEqual(
                    pair_batch_encoding.char_to_token(1, index_char_in_first_seq, sequence_index=0),
                    pair_batch_encoding.char_to_token(1, index_char_in_pair_seq, sequence_index=1),
                )
                self.assertEqual(
                    pair_batch_encoding["input_ids"][1][
                        pair_batch_encoding.char_to_token(1, index_char_in_first_seq, sequence_index=0)
                    ],
                    pair_batch_encoding["input_ids"][1][
                        pair_batch_encoding.char_to_token(1, index_char_in_pair_seq, sequence_index=1)
                    ],
                )

                # Assert char_to_word
                self.assertNotEqual(
                    pair_encoding.char_to_word(index_char_in_first_seq, sequence_index=0),
                    pair_encoding.char_to_word(index_char_in_pair_seq, sequence_index=1),
                )
                self.assertEqual(
                    words[pair_encoding.char_to_word(index_char_in_first_seq, sequence_index=0)],
                    pair_words[pair_encoding.char_to_word(index_char_in_pair_seq, sequence_index=1)],
                )
                self.assertNotEqual(
                    pair_batch_encoding.char_to_word(1, index_char_in_first_seq, sequence_index=0),
                    pair_batch_encoding.char_to_word(1, index_char_in_pair_seq, sequence_index=1),
                )
                self.assertEqual(
                    words[pair_batch_encoding.char_to_word(1, index_char_in_first_seq, sequence_index=0)],
                    pair_words[pair_batch_encoding.char_to_word(1, index_char_in_pair_seq, sequence_index=1)],
                )

                # Assert word_to_chars
                self.assertNotEqual(
                    pair_encoding.word_to_chars(index_word_in_first_seq, sequence_index=0).start,
                    pair_encoding.word_to_chars(index_word_in_pair_seq, sequence_index=1).start,
                )
                self.assertEqual(
                    text[pair_encoding.word_to_chars(index_word_in_first_seq, sequence_index=0).start],
                    pair_text[pair_encoding.word_to_chars(index_word_in_pair_seq, sequence_index=1).start],
                )
                self.assertNotEqual(
                    pair_batch_encoding.word_to_chars(1, index_word_in_first_seq, sequence_index=0).start,
                    pair_batch_encoding.word_to_chars(1, index_word_in_pair_seq, sequence_index=1).start,
                )
                self.assertEqual(
                    text[pair_batch_encoding.word_to_chars(1, index_word_in_first_seq, sequence_index=0).start],
                    pair_text[pair_batch_encoding.word_to_chars(1, index_word_in_pair_seq, sequence_index=1).start],
                )

                # Assert token_to_sequence
                pair_encoding = tokenizer_r.encode_plus(text, pair_text, add_special_tokens=True)

                pair_sequence_ids = [
                    pair_encoding.token_to_sequence(i) for i in range(len(pair_encoding["input_ids"]))
                ]
                self.assertIn(0, pair_sequence_ids)
                self.assertIn(1, pair_sequence_ids)
                if tokenizer_r.num_special_tokens_to_add(pair=True):
                    self.assertIn(None, pair_sequence_ids)

                pair_batch_encoding = tokenizer_r.batch_encode_plus(
                    [(text, pair_text)] * batch_size, add_special_tokens=True
                )
                pair_batch_sequence_ids = [
                    pair_batch_encoding.token_to_sequence(1, i)
                    for i in range(len(pair_batch_encoding["input_ids"][0]))
                ]
                self.assertIn(0, pair_batch_sequence_ids)
                self.assertIn(1, pair_batch_sequence_ids)
                if tokenizer_r.num_special_tokens_to_add(pair=True):
                    self.assertIn(None, pair_batch_sequence_ids)
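                # For reference: token_to_sequence returns 0 for tokens coming from `text`, 1 for tokens
                # coming from `pair_text`, and None for special tokens the tokenizer inserted itself,
                # which is why None is expected above whenever special tokens are added.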

    def test_tokenization_python_rust_equals(self):
        if not self.test_slow_tokenizer:
            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
            return

        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

                # Ensure basic input match
                input_p = tokenizer_p.encode_plus(self._data)
                input_r = tokenizer_r.encode_plus(self._data)

                for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()):
                    self.assertSequenceEqual(input_p[key], input_r[key])

                input_pairs_p = tokenizer_p.encode_plus(self._data, self._data)
                input_pairs_r = tokenizer_r.encode_plus(self._data, self._data)

                for key in filter(
                    lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_pairs_p.keys()
                ):
                    self.assertSequenceEqual(input_pairs_p[key], input_pairs_r[key])

                # Ensure truncation match
                input_p = tokenizer_p.encode_plus(self._data, max_length=512, truncation=True)
                input_r = tokenizer_r.encode_plus(self._data, max_length=512, truncation=True)

                for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()):
                    self.assertSequenceEqual(input_p[key], input_r[key])

                # Ensure truncation with stride match
                input_p = tokenizer_p.encode_plus(
                    self._data, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True
                )
                input_r = tokenizer_r.encode_plus(
                    self._data, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True
                )

                for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()):
                    # the fast tokenizer returns a list of encodings when overflowing tokens are requested
                    self.assertSequenceEqual(input_p[key], input_r[key][0])

    def test_num_special_tokens_to_add_equal(self):
        if not self.test_slow_tokenizer:
            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
            return

        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

                # Check we have the same number of added_tokens for both pair and non-pair inputs.
                self.assertEqual(
                    tokenizer_r.num_special_tokens_to_add(False), tokenizer_p.num_special_tokens_to_add(False)
                )
                self.assertEqual(
                    tokenizer_r.num_special_tokens_to_add(True), tokenizer_p.num_special_tokens_to_add(True)
                )

    def test_max_length_equal(self):
        if not self.test_slow_tokenizer:
            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
            return

        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

                # Check we have the correct max_length for both pair and non-pair inputs.
                self.assertEqual(tokenizer_r.max_len_single_sentence, tokenizer_p.max_len_single_sentence)
                self.assertEqual(tokenizer_r.max_len_sentences_pair, tokenizer_p.max_len_sentences_pair)

    def test_special_tokens_map_equal(self):
        if not self.test_slow_tokenizer:
            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
            return

        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

                # Assert the set of special tokens match.
                self.assertSequenceEqual(
                    tokenizer_p.special_tokens_map.items(),
                    tokenizer_r.special_tokens_map.items(),
                )

    def test_add_tokens(self):
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

                vocab_size = len(tokenizer_r)
                self.assertEqual(tokenizer_r.add_tokens(""), 0)
                self.assertEqual(tokenizer_r.add_tokens("testoken"), 1)
                self.assertEqual(tokenizer_r.add_tokens(["testoken1", "testtoken2"]), 2)
                self.assertEqual(len(tokenizer_r), vocab_size + 3)

                self.assertEqual(tokenizer_r.add_special_tokens({}), 0)
                self.assertEqual(tokenizer_r.add_special_tokens({"bos_token": "[BOS]", "eos_token": "[EOS]"}), 2)
                # additional_special_tokens must be a list, not a bare string
                self.assertRaises(
                    AssertionError, tokenizer_r.add_special_tokens, {"additional_special_tokens": "<testtoken1>"}
                )
                self.assertEqual(tokenizer_r.add_special_tokens({"additional_special_tokens": ["<testtoken2>"]}), 1)
                self.assertEqual(
                    tokenizer_r.add_special_tokens({"additional_special_tokens": ["<testtoken3>", "<testtoken4>"]}), 2
                )

                self.assertIn("<testtoken3>", tokenizer_r.special_tokens_map["additional_special_tokens"])
                self.assertIsInstance(tokenizer_r.special_tokens_map["additional_special_tokens"], list)
                self.assertGreaterEqual(len(tokenizer_r.special_tokens_map["additional_special_tokens"]), 2)

                self.assertEqual(len(tokenizer_r), vocab_size + 8)
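                # Tally for the assertion above: 3 regular tokens ("testoken", "testoken1", "testtoken2")
                # plus [BOS], [EOS], <testtoken2>, <testtoken3> and <testtoken4> give 8 new entries.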

    def test_offsets_mapping(self):
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

                text = "Wonderful no inspiration example with subtoken"
                pair = "Along with an awesome pair"

                # No pair
                tokens_with_offsets = tokenizer_r.encode_plus(
                    text, return_special_tokens_mask=True, return_offsets_mapping=True, add_special_tokens=True
                )
                added_tokens = tokenizer_r.num_special_tokens_to_add(False)
                offsets = tokens_with_offsets["offset_mapping"]

                # Assert there is the same number of tokens and offsets
                self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"]))

                # Assert there are exactly `added_tokens` special tokens
                self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens)

                # Pairs
                tokens_with_offsets = tokenizer_r.encode_plus(
                    text, pair, return_special_tokens_mask=True, return_offsets_mapping=True, add_special_tokens=True
                )
                added_tokens = tokenizer_r.num_special_tokens_to_add(True)
                offsets = tokens_with_offsets["offset_mapping"]

                # Assert there is the same number of tokens and offsets
                self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"]))

                # Assert there are exactly `added_tokens` special tokens
                self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens)
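                # For reference: each `offset_mapping` entry is a (start_char, end_char) span into the
                # original text, and special tokens typically map to (0, 0), which is what makes the
                # length and special-token-count checks above meaningful.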

    def test_batch_encode_dynamic_overflowing(self):
        """
        When calling batch_encode_plus with multiple sequences, it can return a different number of
        overflowing encodings for each sequence:
        [
          Sequence 1: [Encoding 1, Encoding 2],
          Sequence 2: [Encoding 1],
          Sequence 3: [Encoding 1, Encoding 2, ... Encoding N]
        ]
        This needs to be padded so that it can be represented as a tensor.
        """
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            tokenizer = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name}, {tokenizer.__class__.__name__})"):
                if is_torch_available():
                    returned_tensor = "pt"
                elif is_tf_available():
                    returned_tensor = "tf"
                else:
                    returned_tensor = "jax"

                if not tokenizer.pad_token or tokenizer.pad_token_id < 0:
                    return

                tokens = tokenizer.encode_plus(
                    "HuggingFace is solving NLP one commit at a time",
                    max_length=6,
                    padding=True,
                    truncation=True,
                    return_tensors=returned_tensor,
                    return_overflowing_tokens=True,
                )

                for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()):
                    self.assertEqual(len(tokens[key].shape), 2)

                # Mono sample
                tokens = tokenizer.batch_encode_plus(
                    ["HuggingFace is solving NLP one commit at a time"],
                    max_length=6,
                    padding=True,
                    truncation="only_first",
                    return_tensors=returned_tensor,
                    return_overflowing_tokens=True,
                )

                for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()):
                    self.assertEqual(len(tokens[key].shape), 2)
                    self.assertEqual(tokens[key].shape[-1], 6)

                # Multi sample
                tokens = tokenizer.batch_encode_plus(
                    ["HuggingFace is solving NLP one commit at a time", "Very tiny input"],
                    max_length=6,
                    padding=True,
                    truncation="only_first",
                    return_tensors=returned_tensor,
                    return_overflowing_tokens=True,
                )

                for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()):
                    self.assertEqual(len(tokens[key].shape), 2)
                    self.assertEqual(tokens[key].shape[-1], 6)

    def test_compare_pretokenized_inputs(self):
        if not self.test_slow_tokenizer:
            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
            return

        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

                if hasattr(tokenizer_p, "add_prefix_space") and not tokenizer_p.add_prefix_space:
                    continue  # Too hard to test for now

                # Input string
                pretokenized_input_simple = "This is a sample input".split()
                pretokenized_input_pair = "This is a sample pair".split()

                # Test encode for pretokenized inputs
                output_r = tokenizer_r.encode(
                    pretokenized_input_simple, is_split_into_words=True, add_special_tokens=False
                )
                output_p = tokenizer_p.encode(
                    pretokenized_input_simple, is_split_into_words=True, add_special_tokens=False
                )
                self.assertEqual(output_p, output_r)

                kwargs = {
                    "is_split_into_words": True,
                    # "return_token_type_ids": True,  # Use the defaults for each tokenizer
                    # "return_attention_mask": True,  # Use the defaults for each tokenizer
                    "return_overflowing_tokens": False,
                    "return_special_tokens_mask": True,
                    "return_offsets_mapping": False,  # Not implemented in python tokenizers
                    # "add_special_tokens": False,
                }
                batch_kwargs = {
                    "is_split_into_words": True,
                    # "return_token_type_ids": True,  # Use the defaults for each tokenizer
                    # "return_attention_mask": True,  # Use the defaults for each tokenizer
                    "return_overflowing_tokens": False,
                    "return_special_tokens_mask": True,
                    "return_offsets_mapping": False,  # Not implemented in python tokenizers
                    # "add_special_tokens": False,
                }

                # Test encode_plus for pretokenized inputs
                output_r = tokenizer_r.encode_plus(pretokenized_input_simple, **kwargs)
                output_p = tokenizer_p.encode_plus(pretokenized_input_simple, **kwargs)
                for key in output_p.keys():
                    self.assertEqual(output_p[key], output_r[key])

                # Test batch_encode_plus for pretokenized inputs
                input_batch = ([pretokenized_input_simple] * 2) + [pretokenized_input_simple + pretokenized_input_pair]
                output_r = tokenizer_r.batch_encode_plus(input_batch, **batch_kwargs)
                output_p = tokenizer_p.batch_encode_plus(input_batch, **batch_kwargs)
                for key in output_p.keys():
                    self.assertEqual(output_p[key], output_r[key])

                # Test encode for pretokenized input pairs
                output_r = tokenizer_r.encode(
                    pretokenized_input_simple, pretokenized_input_pair, is_split_into_words=True
                )
                output_p = tokenizer_p.encode(
                    pretokenized_input_simple, pretokenized_input_pair, is_split_into_words=True
                )
                self.assertEqual(output_p, output_r)

                # Test encode_plus for pretokenized input pairs
                output_r = tokenizer_r.encode_plus(pretokenized_input_simple, pretokenized_input_pair, **kwargs)
                output_p = tokenizer_p.encode_plus(pretokenized_input_simple, pretokenized_input_pair, **kwargs)
                for key in output_p.keys():
                    self.assertEqual(output_p[key], output_r[key])

                # Test batch_encode_plus for pretokenized input pairs
                input_batch_pair = ([pretokenized_input_simple, pretokenized_input_pair] * 2) + [
                    pretokenized_input_simple + pretokenized_input_pair,
                    pretokenized_input_pair,
                ]
                output_r = tokenizer_r.batch_encode_plus(input_batch_pair, **batch_kwargs)
                output_p = tokenizer_p.batch_encode_plus(input_batch_pair, **batch_kwargs)
                for key in output_p.keys():
                    self.assertEqual(output_p[key], output_r[key])

    def test_create_token_type_ids(self):
        if not self.test_slow_tokenizer:
            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
            return

        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                input_simple = [1, 2, 3]
                input_pair = [1, 2, 3]

                # Generate output
                output_r = tokenizer_r.create_token_type_ids_from_sequences(input_simple)
                output_p = tokenizer_p.create_token_type_ids_from_sequences(input_simple)
                self.assertEqual(output_p, output_r)

                # Generate pair output
                output_r = tokenizer_r.create_token_type_ids_from_sequences(input_simple, input_pair)
                output_p = tokenizer_p.create_token_type_ids_from_sequences(input_simple, input_pair)
                self.assertEqual(output_p, output_r)

    def test_build_inputs_with_special_tokens(self):
        if not self.test_slow_tokenizer:
            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
            return

        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

                # # Input string
                # input_simple = tokenizer_p.tokenize("This is a sample input", add_special_tokens=False)
                # input_pair = tokenizer_p.tokenize("This is a sample pair", add_special_tokens=False)

                # # Generate output
                # output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple)
                # output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple)
                # self.assertEqual(output_p, output_r)

                # # Generate pair output
                # output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple, input_pair)
                # output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair)
                # self.assertEqual(output_p, output_r)

                # Input tokens id
                input_simple = tokenizer_p.encode("This is a sample input", add_special_tokens=False)
                input_pair = tokenizer_p.encode("This is a sample pair", add_special_tokens=False)

                # Generate output
                output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple)
                output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple)
                self.assertEqual(output_p, output_r)

                # Generate pair output
                output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple, input_pair)
                output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair)
                self.assertEqual(output_p, output_r)

    def test_padding(self, max_length=50):
        if not self.test_slow_tokenizer:
            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
            return

        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

                self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
                pad_token_id = tokenizer_p.pad_token_id

                # Encode - Simple input
                input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, pad_to_max_length=True)
                input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, pad_to_max_length=True)
                self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)

                input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, padding="max_length")
                input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, padding="max_length")
                self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)

                input_r = tokenizer_r.encode("This is a simple input", padding="longest")
                input_p = tokenizer_p.encode("This is a simple input", padding=True)
                self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id)

                # Encode - Pair input
                input_r = tokenizer_r.encode(
                    "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
                )
                input_p = tokenizer_p.encode(
                    "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
                )
                self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)

                input_r = tokenizer_r.encode(
                    "This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
                )
                input_p = tokenizer_p.encode(
                    "This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
                )
                self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)

                input_r = tokenizer_r.encode("This is a simple input", "This is a pair", padding=True)
                input_p = tokenizer_p.encode("This is a simple input", "This is a pair", padding="longest")
                self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id)

                # Encode_plus - Simple input
                input_r = tokenizer_r.encode_plus(
                    "This is a simple input", max_length=max_length, pad_to_max_length=True
                )
                input_p = tokenizer_p.encode_plus(
                    "This is a simple input", max_length=max_length, pad_to_max_length=True
                )
                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])

                input_r = tokenizer_r.encode_plus(
                    "This is a simple input", max_length=max_length, padding="max_length"
                )
                input_p = tokenizer_p.encode_plus(
                    "This is a simple input", max_length=max_length, padding="max_length"
                )
                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])

                input_r = tokenizer_r.encode_plus("This is a simple input", padding="longest")
                input_p = tokenizer_p.encode_plus("This is a simple input", padding=True)
                self.assert_padded_input_match(
                    input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id
                )
                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])

                # Encode_plus - Pair input
                input_r = tokenizer_r.encode_plus(
                    "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
                )
                input_p = tokenizer_p.encode_plus(
                    "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
                )
                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])

                input_r = tokenizer_r.encode_plus(
                    "This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
                )
                input_p = tokenizer_p.encode_plus(
                    "This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
                )
                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])

                input_r = tokenizer_r.encode_plus("This is a simple input", "This is a pair", padding="longest")
                input_p = tokenizer_p.encode_plus("This is a simple input", "This is a pair", padding=True)
                self.assert_padded_input_match(
                    input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id
                )
                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
                # Batch_encode_plus - Simple input
                input_r = tokenizer_r.batch_encode_plus(
                    ["This is a simple input 1", "This is a simple input 2"],
                    max_length=max_length,
                    pad_to_max_length=True,
                )
                input_p = tokenizer_p.batch_encode_plus(
                    ["This is a simple input 1", "This is a simple input 2"],
                    max_length=max_length,
                    pad_to_max_length=True,
                )
                self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)

                input_r = tokenizer_r.batch_encode_plus(
                    ["This is a simple input 1", "This is a simple input 2"],
                    max_length=max_length,
                    padding="max_length",
                )
                input_p = tokenizer_p.batch_encode_plus(
                    ["This is a simple input 1", "This is a simple input 2"],
                    max_length=max_length,
                    padding="max_length",
                )
                self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)

                input_r = tokenizer_r.batch_encode_plus(
                    ["This is a simple input 1", "This is a simple input 2"],
                    max_length=max_length,
                    padding="longest",
                )
                input_p = tokenizer_p.batch_encode_plus(
                    ["This is a simple input 1", "This is a simple input 2"],
                    max_length=max_length,
                    padding=True,
                )
                self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id)

                input_r = tokenizer_r.batch_encode_plus(
                    ["This is a simple input 1", "This is a simple input 2"], padding="longest"
                )
                input_p = tokenizer_p.batch_encode_plus(
                    ["This is a simple input 1", "This is a simple input 2"], padding=True
                )
                self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id)

                # Batch_encode_plus - Pair input
                input_r = tokenizer_r.batch_encode_plus(
                    [
                        ("This is a simple input 1", "This is a simple input 2"),
                        ("This is a simple pair 1", "This is a simple pair 2"),
                    ],
                    max_length=max_length,
                    truncation=True,
                    padding="max_length",
                )
                input_p = tokenizer_p.batch_encode_plus(
                    [
                        ("This is a simple input 1", "This is a simple input 2"),
                        ("This is a simple pair 1", "This is a simple pair 2"),
                    ],
                    max_length=max_length,
                    truncation=True,
                    padding="max_length",
                )
                self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)

                input_r = tokenizer_r.batch_encode_plus(
                    [
                        ("This is a simple input 1", "This is a simple input 2"),
                        ("This is a simple pair 1", "This is a simple pair 2"),
                    ],
                    padding=True,
                )
                input_p = tokenizer_p.batch_encode_plus(
                    [
                        ("This is a simple input 1", "This is a simple input 2"),
                        ("This is a simple pair 1", "This is a simple pair 2"),
                    ],
                    padding="longest",
                )
                self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id)
                # Using pad on single examples after tokenization
                input_r = tokenizer_r.encode_plus("This is an input 1")
                input_r = tokenizer_r.pad(input_r)

                input_p = tokenizer_p.encode_plus("This is an input 1")
                input_p = tokenizer_p.pad(input_p)

                self.assert_padded_input_match(
                    input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id
                )

                # Using pad on single examples after tokenization
                input_r = tokenizer_r.encode_plus("This is an input 1")
                input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length")

                input_p = tokenizer_p.encode_plus("This is an input 1")
                input_p = tokenizer_p.pad(input_p, max_length=max_length, padding="max_length")

                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)

                # Using pad after tokenization
                input_r = tokenizer_r.batch_encode_plus(
                    ["This is an input 1", "This is a much longer input which should be padded"]
                )
                input_r = tokenizer_r.pad(input_r)

                input_p = tokenizer_p.batch_encode_plus(
                    ["This is an input 1", "This is a much longer input which should be padded"]
                )
                input_p = tokenizer_p.pad(input_p)

                self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id)

                # Using pad after tokenization
                input_r = tokenizer_r.batch_encode_plus(
                    ["This is an input 1", "This is a much longer input which should be padded"]
                )
                input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length")

                input_p = tokenizer_p.batch_encode_plus(
                    ["This is an input 1", "This is a much longer input which should be padded"]
                )
                input_p = tokenizer_p.pad(input_p, max_length=max_length, padding="max_length")
                self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)

                # Test padding nested empty lists (in some use cases, the `input_ids` lists contain no token id).
                # Note that `pad` also accepts a plain dict of lists here, not only a BatchEncoding.
                input_r = tokenizer_r.pad({"input_ids": [[], []]}, max_length=max_length, padding="max_length")
                input_p = tokenizer_p.pad({"input_ids": [[], []]}, max_length=max_length, padding="max_length")
                self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)

    def test_padding_different_model_input_name(self):
        if not self.test_slow_tokenizer:
            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
            return

        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

                self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
                pad_token_id = tokenizer_p.pad_token_id

                input_r = tokenizer_r.batch_encode_plus(
                    ["This is an input 1", "This is a much longer input which should be padded"]
                )
                input_p = tokenizer_p.batch_encode_plus(
                    ["This is an input 1", "This is a much longer input which should be padded"]
                )

                # rename encoded batch to "inputs"
                input_r["inputs"] = input_r[tokenizer_r.model_input_names[0]]
                del input_r[tokenizer_r.model_input_names[0]]

                input_p["inputs"] = input_p[tokenizer_p.model_input_names[0]]
                del input_p[tokenizer_p.model_input_names[0]]

                # Renaming `input_ids` to `inputs`
                tokenizer_r.model_input_names = ["inputs"] + tokenizer_r.model_input_names[1:]
                tokenizer_p.model_input_names = ["inputs"] + tokenizer_p.model_input_names[1:]

                input_r = tokenizer_r.pad(input_r, padding="longest")
                input_p = tokenizer_p.pad(input_p, padding="longest")

                max_length = len(input_p["inputs"][0])
                self.assert_batch_padded_input_match(
                    input_r, input_p, max_length, pad_token_id, model_main_input_name="inputs"
                )
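                # Note: `pad` reads `tokenizer.model_input_names[0]` to locate the sequences to pad,
                # so renaming the first model input name to "inputs" above is enough for padding to
                # operate on the renamed key.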

    def test_save_pretrained(self):
        if not self.test_slow_tokenizer:
            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
            return

        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

                tmpdirname2 = tempfile.mkdtemp()

                tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2)
                tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)

                # Checks it saves with the same files + the tokenizer.json file for the fast one
                self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
                tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f)
                self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)

                # Checks everything loads correctly in the same way
                tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
                tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)

                # Check special tokens are set accordingly on Rust and Python
                for key in tokenizer_pp.special_tokens_map:
                    self.assertTrue(hasattr(tokenizer_rp, key))
                    # self.assertEqual(getattr(tokenizer_rp, key), getattr(tokenizer_pp, key))
                    # self.assertEqual(getattr(tokenizer_rp, key + "_id"), getattr(tokenizer_pp, key + "_id"))

                shutil.rmtree(tmpdirname2)

                # Save tokenizer rust, legacy_format=True
                tmpdirname2 = tempfile.mkdtemp()

                tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=True)
                tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)

                # Checks it saves with the same files
                self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)

                # Checks everything loads correctly in the same way
                tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
                tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)

                # Check special tokens are set accordingly on Rust and Python
                for key in tokenizer_pp.special_tokens_map:
                    self.assertTrue(hasattr(tokenizer_rp, key))

                shutil.rmtree(tmpdirname2)

                # Save tokenizer rust, legacy_format=False
                tmpdirname2 = tempfile.mkdtemp()

                tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=False)
                tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)

                # Checks it saved the tokenizer.json file
                self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))

                # Checks everything loads correctly in the same way
                tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
                tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)

                # Check special tokens are set accordingly on Rust and Python
                for key in tokenizer_pp.special_tokens_map:
                    self.assertTrue(hasattr(tokenizer_rp, key))

                shutil.rmtree(tmpdirname2)

    def test_embeded_special_tokens(self):
        if not self.test_slow_tokenizer:
            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
            return

        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                sentence = "A, <mask> AllenNLP sentence."
                tokens_r = tokenizer_r.encode_plus(
                    sentence,
                    add_special_tokens=True,
                )
                tokens_p = tokenizer_p.encode_plus(
                    sentence,
                    add_special_tokens=True,
                )

                for key in tokens_p.keys():
                    self.assertEqual(tokens_r[key], tokens_p[key])

                if "token_type_ids" in tokens_r:
                    self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"]))

                tokens_r = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"])
                tokens_p = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"])
                self.assertSequenceEqual(tokens_r, tokens_p)

    def test_compare_add_special_tokens(self):
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

                simple_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=False)
                # pair_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=True)

                for text in ["", " "]:
                    # tokenize()
                    no_special_tokens = tokenizer_r.tokenize(text, add_special_tokens=False)
                    with_special_tokens = tokenizer_r.tokenize(text, add_special_tokens=True)
                    self.assertEqual(
                        len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add
                    )

                    # encode()
                    no_special_tokens = tokenizer_r.encode(text, add_special_tokens=False)
                    with_special_tokens = tokenizer_r.encode(text, add_special_tokens=True)
                    self.assertEqual(
                        len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add
                    )

                    # encode_plus()
                    no_special_tokens = tokenizer_r.encode_plus(text, add_special_tokens=False)
                    with_special_tokens = tokenizer_r.encode_plus(text, add_special_tokens=True)
                    for key in no_special_tokens.keys():
                        self.assertEqual(
                            len(no_special_tokens[key]),
                            len(with_special_tokens[key]) - simple_num_special_tokens_to_add,
                        )

                    # batch_encode_plus()
                    no_special_tokens = tokenizer_r.batch_encode_plus([text, text], add_special_tokens=False)
                    with_special_tokens = tokenizer_r.batch_encode_plus([text, text], add_special_tokens=True)
                    for key in no_special_tokens.keys():
                        for i_no, i_with in zip(no_special_tokens[key], with_special_tokens[key]):
                            self.assertEqual(len(i_no), len(i_with) - simple_num_special_tokens_to_add)

    def test_compare_prepare_for_model(self):
        if not self.test_slow_tokenizer:
            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
            return

        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

                string_sequence = "Asserting that both tokenizers are equal"
                python_output = tokenizer_p.prepare_for_model(
                    tokenizer_p.encode(string_sequence, add_special_tokens=False)
                )
                rust_output = tokenizer_r.prepare_for_model(
                    tokenizer_r.encode(string_sequence, add_special_tokens=False)
                )
                for key in python_output:
                    self.assertEqual(python_output[key], rust_output[key])

    def test_special_tokens_initialization(self):
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                added_tokens = [AddedToken("<special>", lstrip=True)]

                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
                    pretrained_name, additional_special_tokens=added_tokens, **kwargs
                )
                r_output = tokenizer_r.encode("Hey this is a <special> token")

                special_token_id = tokenizer_r.encode("<special>", add_special_tokens=False)[0]

                self.assertTrue(special_token_id in r_output)

                if self.test_slow_tokenizer:
                    tokenizer_cr = self.rust_tokenizer_class.from_pretrained(
                        pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True
                    )
                    tokenizer_p = self.tokenizer_class.from_pretrained(
                        pretrained_name, additional_special_tokens=added_tokens, **kwargs
                    )

                    p_output = tokenizer_p.encode("Hey this is a <special> token")
                    cr_output = tokenizer_cr.encode("Hey this is a <special> token")

                    self.assertEqual(p_output, r_output)
                    self.assertEqual(cr_output, r_output)
                    self.assertTrue(special_token_id in p_output)
                    self.assertTrue(special_token_id in cr_output)

    def test_special_tokens_initialization_with_non_empty_additional_special_tokens(self):
        tokenizer_list = []
        if self.test_slow_tokenizer:
            tokenizer_list.append((self.tokenizer_class, self.get_tokenizer()))

        if self.test_rust_tokenizer:
            tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer()))

        for tokenizer_class, tokenizer_utils in tokenizer_list:
            with tempfile.TemporaryDirectory() as tmp_dir:
                tokenizer_utils.save_pretrained(tmp_dir)

                with open(os.path.join(tmp_dir, "special_tokens_map.json"), encoding="utf-8") as json_file:
                    special_tokens_map = json.load(json_file)

                with open(os.path.join(tmp_dir, "tokenizer_config.json"), encoding="utf-8") as json_file:
                    tokenizer_config = json.load(json_file)

                special_tokens_map["additional_special_tokens"] = ["an_additional_special_token"]
                tokenizer_config["additional_special_tokens"] = ["an_additional_special_token"]

                with open(os.path.join(tmp_dir, "special_tokens_map.json"), "w", encoding="utf-8") as outfile:
                    json.dump(special_tokens_map, outfile)
                with open(os.path.join(tmp_dir, "tokenizer_config.json"), "w", encoding="utf-8") as outfile:
                    json.dump(tokenizer_config, outfile)

                # the following checks allow us to verify that our test works as expected, i.e. that the
                # tokenizer takes into account the new value of additional_special_tokens given in the
                # "tokenizer_config.json" and "special_tokens_map.json" files
                tokenizer_without_change_in_init = tokenizer_class.from_pretrained(tmp_dir)
                self.assertIn(
                    "an_additional_special_token", tokenizer_without_change_in_init.additional_special_tokens
                )
                self.assertIn("an_additional_special_token", tokenizer_without_change_in_init.get_vocab())
                self.assertEqual(
                    ["an_additional_special_token"],
                    tokenizer_without_change_in_init.convert_ids_to_tokens(
                        tokenizer_without_change_in_init.convert_tokens_to_ids(["an_additional_special_token"])
                    ),
                )

                # Now we test that we can change the value of additional_special_tokens in the from_pretrained
                new_added_tokens = [AddedToken("a_new_additional_special_token", lstrip=True)]
                tokenizer = tokenizer_class.from_pretrained(
                    tmp_dir,
                    additional_special_tokens=new_added_tokens,
                )
                self.assertIn("a_new_additional_special_token", tokenizer.additional_special_tokens)
                self.assertEqual(
                    ["a_new_additional_special_token"],
                    tokenizer.convert_ids_to_tokens(
                        tokenizer.convert_tokens_to_ids(["a_new_additional_special_token"])
                    ),
                )
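
    # train_new_from_iterator (fast tokenizers only) retrains the underlying tokenization model on a
    # new corpus while keeping the original pipeline (normalizer, pre-tokenizer, special tokens).
    # Minimal usage sketch:
    #   new_tok = old_tok.train_new_from_iterator(text_iterator, vocab_size)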
    def test_training_new_tokenizer(self):
        # This feature only exists for fast tokenizers
        if not self.test_rust_tokenizer:
            return

        tokenizer = self.get_rust_tokenizer()
        new_tokenizer = tokenizer.train_new_from_iterator(SMALL_TRAINING_CORPUS, 100)

        # Test we can use the new tokenizer with something not seen during training
        inputs = new_tokenizer(["This is the first sentence", "This sentence is different 🤗."])
        self.assertEqual(len(inputs["input_ids"]), 2)
        decoded_input = new_tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)
        expected_result = "This is the first sentence"

        if tokenizer.backend_tokenizer.normalizer is not None:
            expected_result = tokenizer.backend_tokenizer.normalizer.normalize_str(expected_result)
        self.assertEqual(expected_result, decoded_input)

        # We check that the parameters of the tokenizer remained the same
        # Check we have the same number of added_tokens for both pair and non-pair inputs.
        self.assertEqual(tokenizer.num_special_tokens_to_add(False), new_tokenizer.num_special_tokens_to_add(False))
        self.assertEqual(tokenizer.num_special_tokens_to_add(True), new_tokenizer.num_special_tokens_to_add(True))

        # Check we have the correct max_length for both pair and non-pair inputs.
        self.assertEqual(tokenizer.max_len_single_sentence, new_tokenizer.max_len_single_sentence)
        self.assertEqual(tokenizer.max_len_sentences_pair, new_tokenizer.max_len_sentences_pair)

        # Assert the set of special tokens match as we didn't ask to change them
        self.assertSequenceEqual(
            tokenizer.all_special_tokens_extended,
            new_tokenizer.all_special_tokens_extended,
        )
        self.assertDictEqual(tokenizer.special_tokens_map, new_tokenizer.special_tokens_map)
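
    # train_new_from_iterator also accepts special_tokens_map={old_content: new_content} to rename
    # special tokens during retraining; the next test exercises that path for every special token
    # attribute defined on the tokenizer.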
    def test_training_new_tokenizer_with_special_tokens_change(self):
        # This feature only exists for fast tokenizers
        if not self.test_rust_tokenizer:
            return

        tokenizer = self.get_rust_tokenizer()

        # Test with a special tokens map
        class_signature = inspect.signature(tokenizer.__class__)
        if "cls_token" in class_signature.parameters:
            new_tokenizer = tokenizer.train_new_from_iterator(
                SMALL_TRAINING_CORPUS, 100, special_tokens_map={tokenizer.cls_token: "<cls>"}
            )
            cls_id = new_tokenizer.get_vocab()["<cls>"]
            self.assertEqual(new_tokenizer.cls_token, "<cls>")
            self.assertEqual(new_tokenizer.cls_token_id, cls_id)

        # Create a new mapping from the special tokens defined in the original tokenizer
        special_tokens_list = SpecialTokensMixin.SPECIAL_TOKENS_ATTRIBUTES.copy()
        special_tokens_list.remove("additional_special_tokens")
        special_tokens_map = {}
        for token in special_tokens_list:
            # Get the private one to avoid unnecessary warnings.
            if getattr(tokenizer, f"_{token}") is not None:
                special_token = getattr(tokenizer, token)
                special_tokens_map[special_token] = f"{special_token}a"

        # Train new tokenizer
        new_tokenizer = tokenizer.train_new_from_iterator(
            SMALL_TRAINING_CORPUS, 100, special_tokens_map=special_tokens_map
        )

        # Check the changes
        for token in special_tokens_list:
            # Get the private one to avoid unnecessary warnings.
            if getattr(tokenizer, f"_{token}") is None:
                continue
            special_token = getattr(tokenizer, token)
            if special_token in special_tokens_map:
                new_special_token = getattr(new_tokenizer, token)
                self.assertEqual(special_tokens_map[special_token], new_special_token)

                new_id = new_tokenizer.get_vocab()[new_special_token]
                self.assertEqual(getattr(new_tokenizer, f"{token}_id"), new_id)

        # Check if the AddedToken / string format has been kept
        for special_token in tokenizer.all_special_tokens_extended:
            if isinstance(special_token, AddedToken) and special_token.content not in special_tokens_map:
                # The special token must appear identically in the list of the new tokenizer.
                self.assertTrue(
                    special_token in new_tokenizer.all_special_tokens_extended,
                    f"'{special_token}' should be in {new_tokenizer.all_special_tokens_extended}",
                )
            elif isinstance(special_token, AddedToken):
                # The special token must appear in the list of the new tokenizer as an object of type
                # AddedToken with the same parameters as the old AddedToken, except for the content
                # that the user has requested to change.
                special_token_str = special_token.content
                new_special_token_str = special_tokens_map[special_token_str]

                found = False
                for candidate in new_tokenizer.all_special_tokens_extended:
                    if (
                        isinstance(candidate, AddedToken)
                        and candidate.content == new_special_token_str
                        and candidate.lstrip == special_token.lstrip
                        and candidate.rstrip == special_token.rstrip
                        and candidate.normalized == special_token.normalized
                        and candidate.single_word == special_token.single_word
                    ):
                        found = True
                        break
                self.assertTrue(
                    found,
                    (
                        f"'{new_special_token_str}' doesn't appear in the list "
                        f"'{new_tokenizer.all_special_tokens_extended}' as an AddedToken with the same parameters as "
                        f"'{special_token}' in the list {tokenizer.all_special_tokens_extended}"
                    ),
                )
            elif special_token not in special_tokens_map:
                # The special token must appear identically in the list of the new tokenizer.
                self.assertTrue(
                    special_token in new_tokenizer.all_special_tokens_extended,
                    f"'{special_token}' should be in {new_tokenizer.all_special_tokens_extended}",
                )
            else:
                # The special token must appear in the list of the new tokenizer as an object of type string.
                self.assertTrue(special_tokens_map[special_token] in new_tokenizer.all_special_tokens_extended)

        # Test we can use the new tokenizer with something not seen during training
        inputs = new_tokenizer(["This is the first sentence", "This sentence is different 🤗."])
        self.assertEqual(len(inputs["input_ids"]), 2)
        decoded_input = new_tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)
        expected_result = "This is the first sentence"

        if tokenizer.backend_tokenizer.normalizer is not None:
            expected_result = tokenizer.backend_tokenizer.normalizer.normalize_str(expected_result)
        self.assertEqual(expected_result, decoded_input)
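
    # Loading a checkpoint with a tokenizer class different from the one it was saved with should log
    # a mismatch warning. Some combinations instead fail before the warning is emitted because the
    # expected files are missing; the test below tolerates that case too.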
    def test_tokenizer_mismatch_warning(self):
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                with self.assertLogs("transformers", level="WARNING") as cm:
                    error_message = ""
                    try:
                        if self.tokenizer_class == BertTokenizer:
                            AlbertTokenizer.from_pretrained(pretrained_name)
                        else:
                            BertTokenizer.from_pretrained(pretrained_name)
                    except EnvironmentError as e:
                        # Some tokenizers will raise an error before reaching the logged warning
                        # because there are no corresponding files to load
                        error_message = str(e)
                    except (TypeError, AttributeError):
                        # Some tokenizers cannot be loaded into the target tokenizer at all and errors
                        # are returned, here we just check that the warning has been logged before the
                        # error is raised
                        pass
                    finally:
                        logged_msg_target = (
                            "The tokenizer class you load from this checkpoint is not the same type as the class "
                            "this function is called from."
                        )
                        raised_error_msg_target = "Can't load tokenizer for"
                        self.assertTrue(
                            cm.records[0].message.startswith(logged_msg_target)
                            if len(cm.records) > 0
                            else raised_error_msg_target in error_message
                        )

                    try:
                        if self.rust_tokenizer_class == BertTokenizerFast:
                            AlbertTokenizerFast.from_pretrained(pretrained_name)
                        else:
                            BertTokenizerFast.from_pretrained(pretrained_name)
                    except (TypeError, AttributeError):
                        # Some tokenizers cannot be loaded into the target tokenizer at all and errors
                        # are returned, here we just check that the warning has been logged before the
                        # error is raised
                        pass
                    finally:
                        self.assertTrue(
                            cm.records[0].message.startswith(
                                "The tokenizer class you load from this checkpoint is not the same type as the class this function is called from."
                            )
                        )
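
    # Trainer.save_model also saves the tokenizer passed to the Trainer, so a checkpoint built from a
    # fast-only tokenizer directory should still end up containing tokenizer.json.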
    @require_torch
    def test_saving_tokenizer_trainer(self):
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                with tempfile.TemporaryDirectory() as tmp_dir:
                    # Save the fast tokenizer files in a temporary directory
                    tokenizer_old = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs, use_fast=True)
                    tokenizer_old.save_pretrained(tmp_dir, legacy_format=False)  # save only fast version

                    # Initialize toy model for the trainer
                    model = nn.Module()

                    # Load tokenizer from a folder without legacy files
                    tokenizer = self.rust_tokenizer_class.from_pretrained(tmp_dir)
                    training_args = TrainingArguments(output_dir=tmp_dir, do_train=True, no_cuda=True)
                    trainer = Trainer(model=model, args=training_args, tokenizer=tokenizer)

                    # Should not raise an error
                    trainer.save_model(os.path.join(tmp_dir, "checkpoint"))
                    self.assertIn("tokenizer.json", os.listdir(os.path.join(tmp_dir, "checkpoint")))
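
    # Reminder: save_pretrained(..., legacy_format=False) writes only the single tokenizer.json used
    # by fast tokenizers, while legacy_format=True writes the separate slow-tokenizer files. The next
    # test round-trips through the slow files and reloads a fast tokenizer from them.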
    def test_save_slow_from_fast_and_reload_fast(self):
        if not self.test_slow_tokenizer or not self.test_rust_tokenizer:
            # we need both slow and fast versions
            return

        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                with tempfile.TemporaryDirectory() as tmp_dir_1:
                    # Here we check that even if we have initialized a fast tokenizer with a tokenizer_file
                    # we can still save only the slow version and use these saved files to rebuild a tokenizer
                    tokenizer_fast_old_1 = self.rust_tokenizer_class.from_pretrained(
                        pretrained_name, **kwargs, use_fast=True
                    )
                    tokenizer_file = os.path.join(tmp_dir_1, "tokenizer.json")
                    tokenizer_fast_old_1.backend_tokenizer.save(tokenizer_file)

                    tokenizer_fast_old_2 = self.rust_tokenizer_class.from_pretrained(
                        pretrained_name, **kwargs, use_fast=True, tokenizer_file=tokenizer_file
                    )
                    tokenizer_fast_old_2.save_pretrained(tmp_dir_1, legacy_format=True)  # save only slow version

                    tokenizer_slow = self.tokenizer_class.from_pretrained(tmp_dir_1)
                with tempfile.TemporaryDirectory() as tmp_dir_2:
                    tokenizer_slow.save_pretrained(tmp_dir_2)

                    # Should not raise an error
                    self.rust_tokenizer_class.from_pretrained(tmp_dir_2)


class TokenizerUtilTester(unittest.TestCase):
    def test_cached_files_are_used_when_internet_is_down(self):
        # A mock response for an HTTP head request to emulate server down
        response_mock = mock.Mock()
        response_mock.status_code = 500
        response_mock.headers = []
        response_mock.raise_for_status.side_effect = HTTPError

        # Download this model to make sure it's in the cache.
        _ = BertTokenizer.from_pretrained("hf-internal-testing/tiny-random-bert")

        # Under the mock environment we get a 500 error when trying to reach the model.
        with mock.patch("transformers.utils.hub.requests.head", return_value=response_mock) as mock_head:
            _ = BertTokenizer.from_pretrained("hf-internal-testing/tiny-random-bert")
            # This check makes sure the fake head request was actually called
            mock_head.assert_called()


@is_staging_test
class TokenizerPushToHubTester(unittest.TestCase):
    vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "bla", "blou"]

    @classmethod
    def setUpClass(cls):
        cls._token = login(username=USER, password=PASS)

    @classmethod
    def tearDownClass(cls):
        try:
            delete_repo(token=cls._token, name="test-tokenizer")
        except HTTPError:
            pass

        try:
            delete_repo(token=cls._token, name="test-tokenizer-org", organization="valid_org")
        except HTTPError:
            pass

        try:
            delete_repo(token=cls._token, name="test-dynamic-tokenizer")
        except HTTPError:
            pass

    def test_push_to_hub(self):
        with tempfile.TemporaryDirectory() as tmp_dir:
            vocab_file = os.path.join(tmp_dir, "vocab.txt")
            with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
                vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens]))
            tokenizer = BertTokenizer(vocab_file)
            tokenizer.save_pretrained(
                os.path.join(tmp_dir, "test-tokenizer"), push_to_hub=True, use_auth_token=self._token
            )

            new_tokenizer = BertTokenizer.from_pretrained(f"{USER}/test-tokenizer")
            self.assertDictEqual(new_tokenizer.vocab, tokenizer.vocab)

    def test_push_to_hub_in_organization(self):
        with tempfile.TemporaryDirectory() as tmp_dir:
            vocab_file = os.path.join(tmp_dir, "vocab.txt")
            with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
                vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens]))
            tokenizer = BertTokenizer(vocab_file)
            tokenizer.save_pretrained(
                os.path.join(tmp_dir, "test-tokenizer-org"),
                push_to_hub=True,
                use_auth_token=self._token,
                organization="valid_org",
            )

            new_tokenizer = BertTokenizer.from_pretrained("valid_org/test-tokenizer-org")
            self.assertDictEqual(new_tokenizer.vocab, tokenizer.vocab)

    @require_tokenizers
    def test_push_to_hub_dynamic_tokenizer(self):
        CustomTokenizer.register_for_auto_class()
        with tempfile.TemporaryDirectory() as tmp_dir:
            vocab_file = os.path.join(tmp_dir, "vocab.txt")
            with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
                vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens]))
            tokenizer = CustomTokenizer(vocab_file)

        # No fast custom tokenizer
        with tempfile.TemporaryDirectory() as tmp_dir:
            repo = Repository(tmp_dir, clone_from=f"{USER}/test-dynamic-tokenizer", use_auth_token=self._token)
            tokenizer.save_pretrained(tmp_dir)
            with open(os.path.join(tmp_dir, "tokenizer_config.json")) as f:
                tokenizer_config = json.load(f)
            self.assertDictEqual(
                tokenizer_config["auto_map"], {"AutoTokenizer": ["custom_tokenization.CustomTokenizer", None]}
            )
            repo.push_to_hub()

        tokenizer = AutoTokenizer.from_pretrained(f"{USER}/test-dynamic-tokenizer", trust_remote_code=True)
        # Can't make an isinstance check because the tokenizer comes from the CustomTokenizer class of
        # a dynamic module
        self.assertEqual(tokenizer.__class__.__name__, "CustomTokenizer")

        # Fast and slow custom tokenizer
        CustomTokenizerFast.register_for_auto_class()
        with tempfile.TemporaryDirectory() as tmp_dir:
            vocab_file = os.path.join(tmp_dir, "vocab.txt")
            with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
                vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens]))

            bert_tokenizer = BertTokenizerFast.from_pretrained(tmp_dir)
            bert_tokenizer.save_pretrained(tmp_dir)
            tokenizer = CustomTokenizerFast.from_pretrained(tmp_dir)

        with tempfile.TemporaryDirectory() as tmp_dir:
            repo = Repository(tmp_dir, clone_from=f"{USER}/test-dynamic-tokenizer", use_auth_token=self._token)
            tokenizer.save_pretrained(tmp_dir)
            with open(os.path.join(tmp_dir, "tokenizer_config.json")) as f:
                tokenizer_config = json.load(f)
            self.assertDictEqual(
                tokenizer_config["auto_map"],
                {
                    "AutoTokenizer": [
                        "custom_tokenization.CustomTokenizer",
                        "custom_tokenization_fast.CustomTokenizerFast",
                    ]
                },
            )
            repo.push_to_hub()

        tokenizer = AutoTokenizer.from_pretrained(f"{USER}/test-dynamic-tokenizer", trust_remote_code=True)
        # Can't make an isinstance check because the tokenizer comes from the CustomTokenizerFast
        # class of a dynamic module
        self.assertEqual(tokenizer.__class__.__name__, "CustomTokenizerFast")

        tokenizer = AutoTokenizer.from_pretrained(
            f"{USER}/test-dynamic-tokenizer", use_fast=False, trust_remote_code=True
        )
        # Can't make an isinstance check because the tokenizer comes from the CustomTokenizer class of
        # a dynamic module
        self.assertEqual(tokenizer.__class__.__name__, "CustomTokenizer")


class TrieTest(unittest.TestCase):
    def test_trie(self):
        trie = Trie()
        trie.add("Hello 友達")
        self.assertEqual(trie.data, {"H": {"e": {"l": {"l": {"o": {" ": {"友": {"達": {"": 1}}}}}}}}})
        trie.add("Hello")
        self.assertEqual(trie.data, {"H": {"e": {"l": {"l": {"o": {"": 1, " ": {"友": {"達": {"": 1}}}}}}}}})
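
    # The trie stores tokens as nested character dicts, with a terminating "" key marking the end of
    # a complete token; that is why adding "Hello" above inserts {"": 1} under the final "o".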

    def test_trie_split(self):
        trie = Trie()
        self.assertEqual(trie.split("[CLS] This is a extra_id_100"), ["[CLS] This is a extra_id_100"])
        trie.add("[CLS]")
        trie.add("extra_id_1")
        trie.add("extra_id_100")
        self.assertEqual(trie.split("[CLS] This is a extra_id_100"), ["[CLS]", " This is a ", "extra_id_100"])
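
    # split matches greedily on the longest registered token, which is why "extra_id_100" wins over
    # its shorter prefix "extra_id_1" above.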

    def test_trie_single(self):
        trie = Trie()
        trie.add("A")
        self.assertEqual(trie.split("ABC"), ["A", "BC"])
        self.assertEqual(trie.split("BCA"), ["BC", "A"])

    def test_trie_final(self):
        trie = Trie()
        trie.add("TOKEN]")
        trie.add("[SPECIAL_TOKEN]")
        self.assertEqual(trie.split("This is something [SPECIAL_TOKEN]"), ["This is something ", "[SPECIAL_TOKEN]"])

    def test_trie_subtokens(self):
        trie = Trie()
        trie.add("A")
        trie.add("P")
        trie.add("[SPECIAL_TOKEN]")
        self.assertEqual(trie.split("This is something [SPECIAL_TOKEN]"), ["This is something ", "[SPECIAL_TOKEN]"])

    def test_trie_suffix_tokens(self):
        trie = Trie()
        trie.add("AB")
        trie.add("B")
        trie.add("C")
        self.assertEqual(trie.split("ABC"), ["AB", "C"])

    def test_trie_skip(self):
        trie = Trie()
        trie.add("ABC")
        trie.add("B")
        trie.add("CD")
        self.assertEqual(trie.split("ABCD"), ["ABC", "D"])

    def test_cut_text_hardening(self):
        # Even if the offsets are wrong, we necessarily output correct string parts.
        trie = Trie()
        parts = trie.cut_text("ABC", [0, 0, 2, 1, 2, 3])
        self.assertEqual(parts, ["AB", "C"])
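
    # Why the malformed offsets above still yield clean parts: zero-width cuts (0 -> 0, 2 -> 2) and
    # the backward cut (2 -> 1) are discarded, leaving only the consistent slices text[0:2] == "AB"
    # and text[2:3] == "C".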