# coding=utf-8
# Copyright 2019 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import pickle
import re
import shutil
import tempfile
from collections import OrderedDict
from typing import TYPE_CHECKING, Dict, List, Tuple, Union

from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase, PreTrainedTokenizerFast
from transformers.testing_utils import require_tf, require_torch, slow
from transformers.tokenization_utils import AddedToken


if TYPE_CHECKING:
    from transformers import PretrainedConfig, PreTrainedModel, TFPreTrainedModel


def merge_model_tokenizer_mappings(
    model_mapping: Dict["PretrainedConfig", Union["PreTrainedModel", "TFPreTrainedModel"]],
    tokenizer_mapping: Dict["PretrainedConfig", Tuple["PreTrainedTokenizer", "PreTrainedTokenizerFast"]],
) -> Dict[
    Union["PreTrainedTokenizer", "PreTrainedTokenizerFast"],
    Tuple["PretrainedConfig", Union["PreTrainedModel", "TFPreTrainedModel"]],
]:
    configurations = list(model_mapping.keys())
    model_tokenizer_mapping = OrderedDict([])

    for configuration in configurations:
        model = model_mapping[configuration]
        tokenizer = tokenizer_mapping[configuration][0]
        tokenizer_fast = tokenizer_mapping[configuration][1]

        model_tokenizer_mapping.update({tokenizer: (configuration, model)})
        if tokenizer_fast is not None:
            model_tokenizer_mapping.update({tokenizer_fast: (configuration, model)})

    return model_tokenizer_mapping
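

# A minimal sketch of how the mixin below is typically consumed (the class, tokenizer and
# fixture here are illustrative assumptions, not part of this file): a per-model test case
# inherits from the mixin and unittest.TestCase, sets `tokenizer_class`, and writes a small
# vocabulary fixture into `self.tmpdirname` in its own setUp, for example:
#
#     class MyTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
#         tokenizer_class = MyTokenizer
#         test_rust_tokenizer = False
#
#         def setUp(self):
#             super().setUp()
#             # write the vocab/merges fixture files into self.tmpdirname here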
class TokenizerTesterMixin:

    tokenizer_class = None
    test_rust_tokenizer = False

    def setUp(self):
        self.tmpdirname = tempfile.mkdtemp()

    def tearDown(self):
        shutil.rmtree(self.tmpdirname)

    def get_input_output_texts(self, tokenizer):
        input_txt = self.get_clean_sequence(tokenizer)[0]
        return input_txt, input_txt
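
    # Helper below: build a short "clean" text out of the tokenizer's own vocabulary
    # (single-token, purely alphabetic entries) together with the matching ids, so the
    # common tests do not depend on any fixed external corpus.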
    def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20) -> Tuple[str, list]:
        toks = [(i, tokenizer.decode([i], clean_up_tokenization_spaces=False)) for i in range(len(tokenizer))]
        toks = list(filter(lambda t: re.match(r"^[ a-zA-Z]+$", t[1]), toks))
        toks = list(filter(lambda t: [t[0]] == tokenizer.encode(t[1], add_special_tokens=False), toks))
        if max_length is not None and len(toks) > max_length:
            toks = toks[:max_length]
        # toks_str = [t[1] for t in toks]
        toks_ids = [t[0] for t in toks]

        # Ensure consistency
        output_txt = tokenizer.decode(toks_ids, clean_up_tokenization_spaces=False)
        if " " not in output_txt and len(toks_ids) > 1:
            output_txt = (
                tokenizer.decode([toks_ids[0]], clean_up_tokenization_spaces=False)
                + " "
                + tokenizer.decode(toks_ids[1:], clean_up_tokenization_spaces=False)
            )
        if with_prefix_space:
            output_txt = " " + output_txt
        output_ids = tokenizer.encode(output_txt, add_special_tokens=False)
        return output_txt, output_ids

    def get_tokenizers(self, fast=True, **kwargs) -> List[PreTrainedTokenizerBase]:
        if fast and self.test_rust_tokenizer:
            return [self.get_tokenizer(**kwargs), self.get_rust_tokenizer(**kwargs)]
        return [self.get_tokenizer(**kwargs)]

    def get_tokenizer(self, **kwargs) -> PreTrainedTokenizer:
        return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)

    def get_rust_tokenizer(self, **kwargs) -> PreTrainedTokenizerFast:
        raise NotImplementedError

    # def get_input_output_texts(self) -> Tuple[str, str]:
    #     """Feel free to overwrite"""
    #     # TODO: @property
    #     return (
    #         "This is a test",
    #         "This is a test",
    #     )
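
    # For illustration, the conversion implemented below turns a batched mapping such as
    # {"input_ids": [[1, 2], [3, 4]]} (hypothetical values) into a list of per-example
    # dicts: [{"input_ids": [1, 2]}, {"input_ids": [3, 4]}].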
    @staticmethod
    def convert_batch_encode_plus_format_to_encode_plus(batch_encode_plus_sequences):
        # Switch from batch_encode_plus format: {'input_ids': [[...], [...]], ...}
        # to the list of examples/ encode_plus format: [{'input_ids': [...], ...}, {'input_ids': [...], ...}]
        return [
            {value: batch_encode_plus_sequences[value][i] for value in batch_encode_plus_sequences.keys()}
            for i in range(len(batch_encode_plus_sequences["input_ids"]))
        ]

    def test_tokenizers_common_properties(self):
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                attributes_list = [
                    "bos_token",
                    "eos_token",
                    "unk_token",
                    "sep_token",
                    "pad_token",
                    "cls_token",
                    "mask_token",
                ]
                for attr in attributes_list:
                    self.assertTrue(hasattr(tokenizer, attr))
                    self.assertTrue(hasattr(tokenizer, attr + "_id"))

                self.assertTrue(hasattr(tokenizer, "additional_special_tokens"))
                self.assertTrue(hasattr(tokenizer, "additional_special_tokens_ids"))

                attributes_list = [
                    "model_max_length",
                    "init_inputs",
                    "init_kwargs",
                ]
                if not isinstance(tokenizer, PreTrainedTokenizerFast):
                    attributes_list += [
                        "added_tokens_encoder",
                        "added_tokens_decoder",
                    ]
                for attr in attributes_list:
                    self.assertTrue(hasattr(tokenizer, attr))

    def test_save_and_load_tokenizer(self):
        # safety check on max_len default value so we are sure the test works
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                self.assertNotEqual(tokenizer.model_max_length, 42)

        # Now let's start the test
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                # Isolate this from the other tests because we save additional tokens/etc
                tmpdirname = tempfile.mkdtemp()

                sample_text = "He is very happy, UNwant\u00E9d,running"
                before_tokens = tokenizer.encode(sample_text, add_special_tokens=False)
                before_vocab = tokenizer.get_vocab()
                tokenizer.save_pretrained(tmpdirname)

                after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname)
                after_tokens = after_tokenizer.encode(sample_text, add_special_tokens=False)
                after_vocab = after_tokenizer.get_vocab()

                self.assertListEqual(before_tokens, after_tokens)
                self.assertDictEqual(before_vocab, after_vocab)

                shutil.rmtree(tmpdirname)

        # Now the same test, but with added tokens and a custom model_max_length
        tokenizers = self.get_tokenizers(model_max_length=42)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                # Isolate this from the other tests because we save additional tokens/etc
                tmpdirname = tempfile.mkdtemp()

                sample_text = "He is very happy, UNwant\u00E9d,running"

                tokenizer.add_tokens(["bim", "bambam"])
                additional_special_tokens = tokenizer.additional_special_tokens
                additional_special_tokens.append("new_additional_special_token")
                tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
                before_tokens = tokenizer.encode(sample_text, add_special_tokens=False)
                before_vocab = tokenizer.get_vocab()
                tokenizer.save_pretrained(tmpdirname)

                after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname)
                after_tokens = after_tokenizer.encode(sample_text, add_special_tokens=False)
                after_vocab = after_tokenizer.get_vocab()

                self.assertListEqual(before_tokens, after_tokens)
                self.assertDictEqual(before_vocab, after_vocab)
                self.assertIn("bim", after_vocab)
                self.assertIn("bambam", after_vocab)
                self.assertIn("new_additional_special_token", after_tokenizer.additional_special_tokens)
                self.assertEqual(after_tokenizer.model_max_length, 42)

                tokenizer = tokenizer.__class__.from_pretrained(tmpdirname, model_max_length=43)
                self.assertEqual(tokenizer.model_max_length, 43)

                shutil.rmtree(tmpdirname)

    def test_pickle_tokenizer(self):
        """Google pickle __getstate__ __setstate__ if you are struggling with this."""
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                self.assertIsNotNone(tokenizer)

                text = "Munich and Berlin are nice cities"
                subwords = tokenizer.tokenize(text)

                filename = os.path.join(self.tmpdirname, "tokenizer.bin")
                with open(filename, "wb") as handle:
                    pickle.dump(tokenizer, handle)

                with open(filename, "rb") as handle:
                    tokenizer_new = pickle.load(handle)

                subwords_loaded = tokenizer_new.tokenize(text)

                self.assertListEqual(subwords, subwords_loaded)

    def test_pickle_added_tokens(self):
        tok1 = AddedToken("<s>", rstrip=True, lstrip=True, normalized=False, single_word=True)
        tok2 = pickle.loads(pickle.dumps(tok1))

        self.assertEqual(tok1.__getstate__(), tok2.__getstate__())

    def test_added_tokens_do_lower_case(self):
        # TODO(thom) activate fast tokenizer tests once Rust tokenizers accept white spaces in added tokens
        tokenizers = self.get_tokenizers(fast=False, do_lower_case=True)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                special_token = tokenizer.all_special_tokens[0]

                text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token
                text2 = special_token + " AAAAA BBBBBB low CCCCCCCCCDDDDDDDD l " + special_token

                toks0 = tokenizer.tokenize(text)  # toks before adding new_toks

                new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", "AAAAA BBBBBB", "CCCCCCCCCDDDDDDDD"]
                added = tokenizer.add_tokens(new_toks)
                self.assertEqual(added, 2)

                toks = tokenizer.tokenize(text)
                toks2 = tokenizer.tokenize(text2)

                self.assertEqual(len(toks), len(toks2))
                self.assertListEqual(toks, toks2)
                if not isinstance(tokenizer, PreTrainedTokenizerFast):
                    # Python tokenizers can have added tokens with spaces inside them
                    # cf https://github.com/huggingface/tokenizers/issues/302
                    self.assertNotEqual(len(toks), len(toks0))  # toks0 should be longer

                # Check that none of the special tokens are lowercased
                sequence_with_special_tokens = "A " + " yEs ".join(tokenizer.all_special_tokens) + " B"
                tokenized_sequence = tokenizer.tokenize(sequence_with_special_tokens)

                for special_token in tokenizer.all_special_tokens:
                    self.assertTrue(special_token in tokenized_sequence)

        tokenizers = self.get_tokenizers(fast=False, do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                special_token = tokenizer.all_special_tokens[0]

                text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token
                text2 = special_token + " AAAAA BBBBBB low CCCCCCCCCDDDDDDDD l " + special_token

                new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", "AAAAA BBBBBB", "CCCCCCCCCDDDDDDDD"]

                toks0 = tokenizer.tokenize(text)  # toks before adding new_toks

                added = tokenizer.add_tokens(new_toks)
                self.assertEqual(added, 4)

                toks = tokenizer.tokenize(text)
                toks2 = tokenizer.tokenize(text2)

                self.assertEqual(len(toks), len(toks2))  # Length should still be the same
                self.assertNotEqual(toks[1], toks2[1])  # But at least the first non-special tokens should differ
                if not isinstance(tokenizer, PreTrainedTokenizerFast):
                    # Python tokenizers can have added tokens with spaces inside them
                    # cf https://github.com/huggingface/tokenizers/issues/302
                    self.assertNotEqual(len(toks), len(toks0))  # toks0 should be longer

    def test_add_tokens_tokenizer(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                vocab_size = tokenizer.vocab_size
                all_size = len(tokenizer)

                self.assertNotEqual(vocab_size, 0)

                # We usually have added tokens from the start in tests because our vocab fixtures are
                # smaller than the original vocabs - let's not assert this
                # self.assertEqual(vocab_size, all_size)

                new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"]
                added_toks = tokenizer.add_tokens(new_toks)
                vocab_size_2 = tokenizer.vocab_size
                all_size_2 = len(tokenizer)

                self.assertNotEqual(vocab_size_2, 0)
                self.assertEqual(vocab_size, vocab_size_2)
                self.assertEqual(added_toks, len(new_toks))
                self.assertEqual(all_size_2, all_size + len(new_toks))

                tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False)

                self.assertGreaterEqual(len(tokens), 4)
                self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
                self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)

                new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"}
                added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
                vocab_size_3 = tokenizer.vocab_size
                all_size_3 = len(tokenizer)

                self.assertNotEqual(vocab_size_3, 0)
                self.assertEqual(vocab_size, vocab_size_3)
                self.assertEqual(added_toks_2, len(new_toks_2))
                self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))

                tokens = tokenizer.encode(
                    ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", add_special_tokens=False
                )

                self.assertGreaterEqual(len(tokens), 6)
                self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
                self.assertGreater(tokens[0], tokens[1])
                self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
                self.assertGreater(tokens[-2], tokens[-3])
                self.assertEqual(tokens[0], tokenizer.eos_token_id)
                self.assertEqual(tokens[-2], tokenizer.pad_token_id)

    def test_add_special_tokens(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                input_text, ids = self.get_clean_sequence(tokenizer)

                special_token = "[SPECIAL_TOKEN]"

                tokenizer.add_special_tokens({"cls_token": special_token})
                encoded_special_token = tokenizer.encode(special_token, add_special_tokens=False)
                self.assertEqual(len(encoded_special_token), 1)

                text = tokenizer.decode(ids + encoded_special_token, clean_up_tokenization_spaces=False)
                encoded = tokenizer.encode(text, add_special_tokens=False)

                input_encoded = tokenizer.encode(input_text, add_special_tokens=False)
                special_token_id = tokenizer.encode(special_token, add_special_tokens=False)
                self.assertEqual(encoded, input_encoded + special_token_id)

                decoded = tokenizer.decode(encoded, skip_special_tokens=True)
                self.assertTrue(special_token not in decoded)

    def test_internal_consistency(self):
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                input_text, output_text = self.get_input_output_texts(tokenizer)

                tokens = tokenizer.tokenize(input_text)
                ids = tokenizer.convert_tokens_to_ids(tokens)
                ids_2 = tokenizer.encode(input_text, add_special_tokens=False)
                self.assertListEqual(ids, ids_2)

                tokens_2 = tokenizer.convert_ids_to_tokens(ids)
                self.assertNotEqual(len(tokens_2), 0)
                text_2 = tokenizer.decode(ids)
                self.assertIsInstance(text_2, str)

                self.assertEqual(text_2, output_text)

    def test_encode_decode_with_spaces(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):

                new_toks = ["[ABC]", "[DEF]"]  # TODO(thom) add this one back when Rust toks are ready: , "GHI IHG"]
                tokenizer.add_tokens(new_toks)
                input = "[ABC] [DEF] [ABC] [DEF]"  # TODO(thom) add back cf above: "[ABC] [DEF] [ABC] GHI IHG [DEF]"
                encoded = tokenizer.encode(input, add_special_tokens=False)
                decoded = tokenizer.decode(encoded)

                self.assertEqual(decoded, input)

    def test_pretrained_model_lists(self):
        weights_list = list(self.tokenizer_class.max_model_input_sizes.keys())
        weights_lists_2 = []
        for file_id, map_list in self.tokenizer_class.pretrained_vocab_files_map.items():
            weights_lists_2.append(list(map_list.keys()))

        for weights_list_2 in weights_lists_2:
            self.assertListEqual(weights_list, weights_list_2)

    def test_mask_output(self):
        tokenizers = self.get_tokenizers(fast=False, do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):

                if (
                    tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer"
                    and "token_type_ids" in tokenizer.model_input_names
                ):
                    seq_0 = "Test this method."
                    seq_1 = "With these inputs."
                    information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True)
                    sequences, mask = information["input_ids"], information["token_type_ids"]
                    self.assertEqual(len(sequences), len(mask))

    def test_number_of_added_tokens(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):

                seq_0 = "Test this method."
                seq_1 = "With these inputs."

                sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=False)
                attached_sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)

                # Method is implemented (e.g. not GPT-2)
                if len(attached_sequences) != 2:
                    self.assertEqual(
                        tokenizer.num_special_tokens_to_add(pair=True), len(attached_sequences) - len(sequences)
                    )

    def test_maximum_encoding_length_single_input(self):
        tokenizers = self.get_tokenizers(do_lower_case=False, model_max_length=100)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                seq_0, ids = self.get_clean_sequence(tokenizer, max_length=20)

                sequence = tokenizer.encode(seq_0, add_special_tokens=False)
                total_length = len(sequence)

                assert total_length > 1, "Issue with the testing sequence, please update it, it's too short"

                # Test with max model input length
                model_max_length = tokenizer.model_max_length
                self.assertEqual(model_max_length, 100)
                seq_1 = seq_0 * model_max_length

                sequence1 = tokenizer(seq_1, add_special_tokens=False)
                total_length1 = len(sequence1["input_ids"])
                assert (
                    total_length1 > model_max_length
                ), "Issue with the testing sequence, please update it, it's too short"

                # Simple
                padding_strategies = (
                    [False, True, "longest"] if tokenizer.pad_token and tokenizer.pad_token_id >= 0 else [False]
                )
                for padding_state in padding_strategies:
                    with self.subTest(f"Padding: {padding_state}"):
                        for truncation_state in [True, "longest_first", "only_first"]:
                            with self.subTest(f"Truncation: {truncation_state}"):
                                output = tokenizer(seq_1, padding=padding_state, truncation=truncation_state)
                                self.assertEqual(len(output["input_ids"]), model_max_length)

                                output = tokenizer([seq_1], padding=padding_state, truncation=truncation_state)
                                self.assertEqual(len(output["input_ids"][0]), model_max_length)

                        # Simple with no truncation
                        output = tokenizer(seq_1, padding=padding_state, truncation=False)
                        self.assertNotEqual(len(output["input_ids"]), model_max_length)

                        output = tokenizer([seq_1], padding=padding_state, truncation=False)
                        self.assertNotEqual(len(output["input_ids"][0]), model_max_length)

                # Overflowing tokens
                stride = 2
                information = tokenizer(
                    seq_0,
                    max_length=total_length - 2,
                    add_special_tokens=False,
                    stride=stride,
                    truncation="longest_first",
                    return_overflowing_tokens=True,
                    # add_prefix_space=False,
                )

                # Overflowing tokens are handled quite differently in slow and fast tokenizers
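                # (Fast tokenizers return the overflow as a second "input_ids" entry, which is why it
                # also carries the stride window; slow Python tokenizers return a separate
                # "overflowing_tokens" list instead - see the two branches below.)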
                if isinstance(tokenizer, PreTrainedTokenizerFast):
                    truncated_sequence = information["input_ids"][0]
                    overflowing_tokens = information["input_ids"][1]
                    self.assertEqual(len(information["input_ids"]), 2)

                    self.assertEqual(len(truncated_sequence), total_length - 2)
                    self.assertEqual(truncated_sequence, sequence[:-2])

                    self.assertEqual(len(overflowing_tokens), 2 + stride)
                    self.assertEqual(overflowing_tokens, sequence[-(2 + stride):])
                else:
                    truncated_sequence = information["input_ids"]
                    overflowing_tokens = information["overflowing_tokens"]

                    self.assertEqual(len(truncated_sequence), total_length - 2)
                    self.assertEqual(truncated_sequence, sequence[:-2])

                    self.assertEqual(len(overflowing_tokens), 2 + stride)

    def test_maximum_encoding_length_pair_input(self):
        tokenizers = self.get_tokenizers(do_lower_case=False, model_max_length=100)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                # Build a sequence from our model's vocabulary
                stride = 2
                seq_0, ids = self.get_clean_sequence(tokenizer, max_length=20)
                if len(ids) <= 2 + stride:
                    seq_0 = (seq_0 + " ") * (2 + stride)
                    ids = None

                seq0_tokens = tokenizer.encode(seq_0, add_special_tokens=False)
                assert len(seq0_tokens) > 2 + stride

                seq_1 = "This is another sentence to be encoded."
                seq1_tokens = tokenizer.encode(seq_1, add_special_tokens=False)
                if abs(len(seq0_tokens) - len(seq1_tokens)) <= 2:
                    seq1_tokens = seq1_tokens + seq1_tokens
                    seq_1 = tokenizer.decode(seq1_tokens, clean_up_tokenization_spaces=False)
                seq1_tokens = tokenizer.encode(seq_1, add_special_tokens=False)

                assert len(seq1_tokens) > 2 + stride

                smallest = seq1_tokens if len(seq0_tokens) > len(seq1_tokens) else seq0_tokens

                # We are not using the special tokens - a bit too hard to test all the tokenizers with this
                # TODO try this again later
                sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=False)  # , add_prefix_space=False)

                # Test with max model input length
                model_max_length = tokenizer.model_max_length
                self.assertEqual(model_max_length, 100)
                seq_2 = seq_0 * model_max_length

                sequence1 = tokenizer(seq_1, add_special_tokens=False)
                total_length1 = len(sequence1["input_ids"])
                sequence2 = tokenizer(seq_2, seq_1, add_special_tokens=False)
                total_length2 = len(sequence2["input_ids"])
                assert total_length1 < model_max_length - 10, "Issue with the testing sequence, please update it."
                assert total_length2 > model_max_length, "Issue with the testing sequence, please update it."

                # Simple
                padding_strategies = (
                    [False, True, "longest"] if tokenizer.pad_token and tokenizer.pad_token_id >= 0 else [False]
                )
                for padding_state in padding_strategies:
                    with self.subTest(f"Padding: {padding_state}"):
                        for truncation_state in [True, "longest_first", "only_first"]:
                            with self.subTest(f"Truncation: {truncation_state}"):
                                output = tokenizer(seq_2, seq_1, padding=padding_state, truncation=truncation_state)
                                self.assertEqual(len(output["input_ids"]), model_max_length)

                                output = tokenizer(
                                    [seq_2], [seq_1], padding=padding_state, truncation=truncation_state
                                )
                                self.assertEqual(len(output["input_ids"][0]), model_max_length)

                        # Simple
                        output = tokenizer(seq_1, seq_2, padding=padding_state, truncation="only_second")
                        self.assertEqual(len(output["input_ids"]), model_max_length)

                        output = tokenizer([seq_1], [seq_2], padding=padding_state, truncation="only_second")
                        self.assertEqual(len(output["input_ids"][0]), model_max_length)

                        # Simple with no truncation
                        output = tokenizer(seq_1, seq_2, padding=padding_state, truncation=False)
                        self.assertNotEqual(len(output["input_ids"]), model_max_length)

                        output = tokenizer([seq_1], [seq_2], padding=padding_state, truncation=False)
                        self.assertNotEqual(len(output["input_ids"][0]), model_max_length)

                truncated_first_sequence = tokenizer.encode(seq_0, add_special_tokens=False)[:-2] + tokenizer.encode(
                    seq_1, add_special_tokens=False
                )
                truncated_second_sequence = (
                    tokenizer.encode(seq_0, add_special_tokens=False)
                    + tokenizer.encode(seq_1, add_special_tokens=False)[:-2]
                )
                truncated_longest_sequence = (
                    truncated_first_sequence if len(seq0_tokens) > len(seq1_tokens) else truncated_second_sequence
                )

                overflow_first_sequence = tokenizer.encode(seq_0, add_special_tokens=False)[
                    -(2 + stride):
                ] + tokenizer.encode(seq_1, add_special_tokens=False)
                overflow_second_sequence = (
                    tokenizer.encode(seq_0, add_special_tokens=False)
                    + tokenizer.encode(seq_1, add_special_tokens=False)[-(2 + stride):]
                )
                overflow_longest_sequence = (
                    overflow_first_sequence if len(seq0_tokens) > len(seq1_tokens) else overflow_second_sequence
                )

                information = tokenizer.encode_plus(
                    seq_0,
                    seq_1,
                    max_length=len(sequence) - 2,
                    add_special_tokens=False,
                    stride=stride,
                    truncation="longest_first",
                    return_overflowing_tokens=True,
                    # add_prefix_space=False,
                )
                # Overflowing tokens are handled quite differently in slow and fast tokenizers
                if isinstance(tokenizer, PreTrainedTokenizerFast):
                    truncated_sequence = information["input_ids"][0]
                    overflowing_tokens = information["input_ids"][1]
                    self.assertEqual(len(information["input_ids"]), 2)

                    self.assertEqual(len(truncated_sequence), len(sequence) - 2)
                    self.assertEqual(truncated_sequence, truncated_longest_sequence)

                    self.assertEqual(len(overflowing_tokens), 2 + stride + len(smallest))
                    self.assertEqual(overflowing_tokens, overflow_longest_sequence)
                else:
                    truncated_sequence = information["input_ids"]
                    overflowing_tokens = information["overflowing_tokens"]

                    self.assertEqual(len(truncated_sequence), len(sequence) - 2)
                    self.assertEqual(truncated_sequence, truncated_longest_sequence)

                    self.assertEqual(
                        len(overflowing_tokens), 2 + stride
                    )  # python tokenizers only return the 2 + stride overflowing tokens, not the whole smaller sequence
                information = tokenizer.encode_plus(
                    seq_0,
                    seq_1,
                    max_length=len(sequence) - 2,
                    add_special_tokens=False,
                    stride=stride,
                    truncation=True,
                    return_overflowing_tokens=True,
                    # add_prefix_space=False,
                )
                # Overflowing tokens are handled quite differently in slow and fast tokenizers
                if isinstance(tokenizer, PreTrainedTokenizerFast):
                    truncated_sequence = information["input_ids"][0]
                    overflowing_tokens = information["input_ids"][1]
                    self.assertEqual(len(information["input_ids"]), 2)

                    self.assertEqual(len(truncated_sequence), len(sequence) - 2)
                    self.assertEqual(truncated_sequence, truncated_longest_sequence)

                    self.assertEqual(len(overflowing_tokens), 2 + stride + len(smallest))
                    self.assertEqual(overflowing_tokens, overflow_longest_sequence)
                else:
                    truncated_sequence = information["input_ids"]
                    overflowing_tokens = information["overflowing_tokens"]

                    self.assertEqual(len(truncated_sequence), len(sequence) - 2)
                    self.assertEqual(truncated_sequence, truncated_longest_sequence)

                    self.assertEqual(
                        len(overflowing_tokens), 2 + stride
                    )  # python tokenizers only return the 2 + stride overflowing tokens, not the whole smaller sequence

                information_first_truncated = tokenizer.encode_plus(
                    seq_0,
                    seq_1,
                    max_length=len(sequence) - 2,
                    add_special_tokens=False,
                    stride=stride,
                    truncation="only_first",
                    return_overflowing_tokens=True,
                    # add_prefix_space=False,
                )
                # Overflowing tokens are handled quite differently in slow and fast tokenizers
                if isinstance(tokenizer, PreTrainedTokenizerFast):
                    truncated_sequence = information_first_truncated["input_ids"][0]
                    overflowing_tokens = information_first_truncated["input_ids"][1]
                    self.assertEqual(len(information_first_truncated["input_ids"]), 2)

                    self.assertEqual(len(truncated_sequence), len(sequence) - 2)
                    self.assertEqual(truncated_sequence, truncated_first_sequence)

                    self.assertEqual(len(overflowing_tokens), 2 + stride + len(seq1_tokens))
                    self.assertEqual(overflowing_tokens, overflow_first_sequence)
                else:
                    truncated_sequence = information_first_truncated["input_ids"]
                    overflowing_tokens = information_first_truncated["overflowing_tokens"]

                    self.assertEqual(len(truncated_sequence), len(sequence) - 2)
                    self.assertEqual(truncated_sequence, truncated_first_sequence)

                    self.assertEqual(len(overflowing_tokens), 2 + stride)
                    self.assertEqual(overflowing_tokens, seq0_tokens[-(2 + stride):])

                information_second_truncated = tokenizer.encode_plus(
                    seq_0,
                    seq_1,
                    max_length=len(sequence) - 2,
                    add_special_tokens=False,
                    stride=stride,
                    truncation="only_second",
                    return_overflowing_tokens=True,
                    # add_prefix_space=False,
                )
                # Overflowing tokens are handled quite differently in slow and fast tokenizers
                if isinstance(tokenizer, PreTrainedTokenizerFast):
                    truncated_sequence = information_second_truncated["input_ids"][0]
                    overflowing_tokens = information_second_truncated["input_ids"][1]
                    self.assertEqual(len(information_second_truncated["input_ids"]), 2)

                    self.assertEqual(len(truncated_sequence), len(sequence) - 2)
                    self.assertEqual(truncated_sequence, truncated_second_sequence)

                    self.assertEqual(len(overflowing_tokens), 2 + stride + len(seq0_tokens))
                    self.assertEqual(overflowing_tokens, overflow_second_sequence)
                else:
                    truncated_sequence = information_second_truncated["input_ids"]
                    overflowing_tokens = information_second_truncated["overflowing_tokens"]

                    self.assertEqual(len(truncated_sequence), len(sequence) - 2)
                    self.assertEqual(truncated_sequence, truncated_second_sequence)

                    self.assertEqual(len(overflowing_tokens), 2 + stride)
                    self.assertEqual(overflowing_tokens, seq1_tokens[-(2 + stride):])

    # def test_encode_input_type(self):
    #     tokenizers = self.get_tokenizers(do_lower_case=False)
    #     for tokenizer in tokenizers:
    #         with self.subTest(f"{tokenizer.__class__.__name__}"):
    #             sequence = "Let's encode this sequence"

    #             tokens = sequence.split()  # tokenizer.tokenize(sequence)
    #             # input_ids = tokenizer.convert_tokens_to_ids(tokens)
    #             formatted_input = tokenizer.encode(sequence, add_special_tokens=True, add_prefix_space=False)

    #             self.assertEqual(
    #                 tokenizer.encode(tokens, is_pretokenized=True, add_special_tokens=True), formatted_input
    #             )
    #             # This is not supported with the Rust tokenizers
    #             # self.assertEqual(tokenizer.encode(input_ids, add_special_tokens=True), formatted_input)
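
    # The test below replaces the mask token id at the masked position with the id taken from
    # the unmasked encoding; after that swap the two encodings must be identical, which shows
    # the mask token is encoded as a single, stable id wherever it appears.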
    def test_swap_special_token(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                mask = "<mask>"
                sequence = "Encode this sequence"
                sequence_masked_0 = "Encode <mask> sequence"
                sequence_masked_1 = "<mask> this sequence"

                # Add tokens so that masked token isn't split
                tokenizer.add_tokens(sequence.split())
                tokenizer.add_special_tokens({"mask_token": mask})
                mask_ind = tokenizer.convert_tokens_to_ids(mask)
                encoded = tokenizer.encode(sequence, add_special_tokens=False)

                # Test first masked sequence
                encoded_masked = tokenizer.encode(sequence_masked_0, add_special_tokens=False)
                mask_loc = encoded_masked.index(mask_ind)
                encoded_masked[mask_loc] = encoded[mask_loc]

                self.assertEqual(encoded_masked, encoded)

                # Test second masked sequence
                encoded_masked = tokenizer.encode(sequence_masked_1, add_special_tokens=False)
                mask_loc = encoded_masked.index(mask_ind)
                encoded_masked[mask_loc] = encoded[mask_loc]

                self.assertEqual(encoded_masked, encoded)

    def test_special_tokens_mask(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                sequence_0 = "Encode this."

                # Testing single inputs
                encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False)
                encoded_sequence_dict = tokenizer.encode_plus(
                    sequence_0, add_special_tokens=True, return_special_tokens_mask=True  # , add_prefix_space=False
                )
                encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
                special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
                self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))

                filtered_sequence = [x for i, x in enumerate(encoded_sequence_w_special) if not special_tokens_mask[i]]
                self.assertEqual(encoded_sequence, filtered_sequence)

    def test_special_tokens_mask_input_pairs(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                sequence_0 = "Encode this."
                sequence_1 = "This one too please."
                encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False)
                encoded_sequence += tokenizer.encode(sequence_1, add_special_tokens=False)
                encoded_sequence_dict = tokenizer.encode_plus(
                    sequence_0,
                    sequence_1,
                    add_special_tokens=True,
                    return_special_tokens_mask=True,
                    # add_prefix_space=False,
                )
                encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
                special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
                self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))

                filtered_sequence = [
                    (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)
                ]
                filtered_sequence = [x for x in filtered_sequence if x is not None]
                self.assertEqual(encoded_sequence, filtered_sequence)

    def test_right_and_left_padding(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                sequence = "Sequence"
                padding_size = 10

                # check correct behaviour if no pad_token_id exists and add it eventually
                self._check_no_pad_token_padding(tokenizer, sequence)

                padding_idx = tokenizer.pad_token_id

                # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
                tokenizer.padding_side = "right"
                encoded_sequence = tokenizer.encode(sequence)
                sequence_length = len(encoded_sequence)
                padded_sequence = tokenizer.encode(
                    sequence, max_length=sequence_length + padding_size, padding="max_length"
                )
                padded_sequence_length = len(padded_sequence)
                assert sequence_length + padding_size == padded_sequence_length
                assert encoded_sequence + [padding_idx] * padding_size == padded_sequence

                # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
                tokenizer.padding_side = "left"
                encoded_sequence = tokenizer.encode(sequence)
                sequence_length = len(encoded_sequence)
                padded_sequence = tokenizer.encode(
                    sequence, max_length=sequence_length + padding_size, padding="max_length"
                )
                padded_sequence_length = len(padded_sequence)
                assert sequence_length + padding_size == padded_sequence_length
                assert [padding_idx] * padding_size + encoded_sequence == padded_sequence

                # RIGHT & LEFT PADDING - Check that nothing is done for 'longest' and 'no_padding'
                encoded_sequence = tokenizer.encode(sequence)
                sequence_length = len(encoded_sequence)

                tokenizer.padding_side = "right"
                padded_sequence_right = tokenizer.encode(sequence, padding=True)
                padded_sequence_right_length = len(padded_sequence_right)
                assert sequence_length == padded_sequence_right_length
                assert encoded_sequence == padded_sequence_right

                tokenizer.padding_side = "left"
                padded_sequence_left = tokenizer.encode(sequence, padding="longest")
                padded_sequence_left_length = len(padded_sequence_left)
                assert sequence_length == padded_sequence_left_length
                assert encoded_sequence == padded_sequence_left

                tokenizer.padding_side = "right"
                padded_sequence_right = tokenizer.encode(sequence)
                padded_sequence_right_length = len(padded_sequence_right)
                assert sequence_length == padded_sequence_right_length
                assert encoded_sequence == padded_sequence_right

                tokenizer.padding_side = "left"
                padded_sequence_left = tokenizer.encode(sequence, padding=False)
                padded_sequence_left_length = len(padded_sequence_left)
                assert sequence_length == padded_sequence_left_length
                assert encoded_sequence == padded_sequence_left

    def test_padding_to_max_length(self):
        """We keep this test for backward compatibility; it should be removed when `pad_to_max_length` is deprecated."""
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                sequence = "Sequence"
                padding_size = 10

                # check correct behaviour if no pad_token_id exists and add it eventually
                self._check_no_pad_token_padding(tokenizer, sequence)

                padding_idx = tokenizer.pad_token_id

                # Check that it correctly pads when a maximum length is specified along with the padding flag set to True
                tokenizer.padding_side = "right"
                encoded_sequence = tokenizer.encode(sequence)
                sequence_length = len(encoded_sequence)
                # FIXME: the next line should be padding(max_length) to avoid warning
                padded_sequence = tokenizer.encode(
                    sequence, max_length=sequence_length + padding_size, pad_to_max_length=True
                )
                padded_sequence_length = len(padded_sequence)
                assert sequence_length + padding_size == padded_sequence_length
                assert encoded_sequence + [padding_idx] * padding_size == padded_sequence

                # Check that nothing is done when a maximum length is not specified
                encoded_sequence = tokenizer.encode(sequence)
                sequence_length = len(encoded_sequence)

                tokenizer.padding_side = "right"
                padded_sequence_right = tokenizer.encode(sequence, pad_to_max_length=True)
                padded_sequence_right_length = len(padded_sequence_right)
                assert sequence_length == padded_sequence_right_length
                assert encoded_sequence == padded_sequence_right
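
    # pad_to_multiple_of (exercised below) is typically used so padded lengths land on
    # hardware-friendly multiples; the value 8 here is simply what this test checks.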
    def test_padding_to_multiple_of(self):
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            if tokenizer.pad_token is None:
                self.skipTest("No padding token.")
            else:
                with self.subTest(f"{tokenizer.__class__.__name__}"):
                    empty_tokens = tokenizer("", padding=True, pad_to_multiple_of=8)
                    normal_tokens = tokenizer("This is a sample input", padding=True, pad_to_multiple_of=8)
                    for key, value in empty_tokens.items():
                        self.assertEqual(len(value) % 8, 0, "BatchEncoding.{} is not multiple of 8".format(key))
                    for key, value in normal_tokens.items():
                        self.assertEqual(len(value) % 8, 0, "BatchEncoding.{} is not multiple of 8".format(key))

                    normal_tokens = tokenizer("This", pad_to_multiple_of=8)
                    for key, value in normal_tokens.items():
                        self.assertNotEqual(len(value) % 8, 0, "BatchEncoding.{} is unexpectedly a multiple of 8".format(key))

                    # Should also work with truncation
                    normal_tokens = tokenizer("This", padding=True, truncation=True, pad_to_multiple_of=8)
                    for key, value in normal_tokens.items():
                        self.assertEqual(len(value) % 8, 0, "BatchEncoding.{} is not multiple of 8".format(key))

                    # truncation to something which is not a multiple of pad_to_multiple_of raises an error
                    self.assertRaises(
                        ValueError,
                        tokenizer.__call__,
                        "This",
                        padding=True,
                        truncation=True,
                        max_length=12,
                        pad_to_multiple_of=8,
                    )

    def test_encode_plus_with_padding(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                sequence = "Sequence"

                # check correct behaviour if no pad_token_id exists and add it eventually
                self._check_no_pad_token_padding(tokenizer, sequence)

                padding_size = 10
                padding_idx = tokenizer.pad_token_id
                token_type_padding_idx = tokenizer.pad_token_type_id

                encoded_sequence = tokenizer.encode_plus(sequence, return_special_tokens_mask=True)
                input_ids = encoded_sequence["input_ids"]
                special_tokens_mask = encoded_sequence["special_tokens_mask"]
                sequence_length = len(input_ids)

                # Test 'longest' and 'no_padding' don't do anything
                tokenizer.padding_side = "right"

                not_padded_sequence = tokenizer.encode_plus(sequence, padding=True, return_special_tokens_mask=True)
                not_padded_input_ids = not_padded_sequence["input_ids"]
                not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"]
                not_padded_sequence_length = len(not_padded_input_ids)

                assert sequence_length == not_padded_sequence_length
                assert input_ids == not_padded_input_ids
                assert special_tokens_mask == not_padded_special_tokens_mask

                not_padded_sequence = tokenizer.encode_plus(sequence, padding=False, return_special_tokens_mask=True)
                not_padded_input_ids = not_padded_sequence["input_ids"]
                not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"]
                not_padded_sequence_length = len(not_padded_input_ids)

                assert sequence_length == not_padded_sequence_length
                assert input_ids == not_padded_input_ids
                assert special_tokens_mask == not_padded_special_tokens_mask

                # Test right padding
                tokenizer.padding_side = "right"
                right_padded_sequence = tokenizer.encode_plus(
                    sequence,
                    max_length=sequence_length + padding_size,
                    padding="max_length",
                    return_special_tokens_mask=True,
                )
                right_padded_input_ids = right_padded_sequence["input_ids"]
                right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
                right_padded_sequence_length = len(right_padded_input_ids)

                assert sequence_length + padding_size == right_padded_sequence_length
                assert input_ids + [padding_idx] * padding_size == right_padded_input_ids
                assert special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask

                # Test left padding
                tokenizer.padding_side = "left"
                left_padded_sequence = tokenizer.encode_plus(
                    sequence,
                    max_length=sequence_length + padding_size,
                    padding="max_length",
                    return_special_tokens_mask=True,
                )
                left_padded_input_ids = left_padded_sequence["input_ids"]
                left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
                left_padded_sequence_length = len(left_padded_input_ids)

                assert sequence_length + padding_size == left_padded_sequence_length
                assert [padding_idx] * padding_size + input_ids == left_padded_input_ids
                assert [1] * padding_size + special_tokens_mask == left_padded_special_tokens_mask

                if "token_type_ids" in tokenizer.model_input_names:
                    token_type_ids = encoded_sequence["token_type_ids"]
                    left_padded_token_type_ids = left_padded_sequence["token_type_ids"]
                    right_padded_token_type_ids = right_padded_sequence["token_type_ids"]

                    assert token_type_ids + [token_type_padding_idx] * padding_size == right_padded_token_type_ids
                    assert [token_type_padding_idx] * padding_size + token_type_ids == left_padded_token_type_ids

                if "attention_mask" in tokenizer.model_input_names:
                    attention_mask = encoded_sequence["attention_mask"]
                    right_padded_attention_mask = right_padded_sequence["attention_mask"]
                    left_padded_attention_mask = left_padded_sequence["attention_mask"]

                    assert attention_mask + [0] * padding_size == right_padded_attention_mask
                    assert [0] * padding_size + attention_mask == left_padded_attention_mask

    def test_separate_tokenizers(self):
        # This tests that tokenizers don't impact others. Unfortunately the case where it fails is when
        # we're loading an S3 configuration from a pre-trained identifier, and we have no way of testing those today.

        tokenizer = self.get_tokenizer(random_argument=True)
        assert tokenizer.init_kwargs["random_argument"] is True
        new_tokenizer = self.get_tokenizer(random_argument=False)
        assert tokenizer.init_kwargs["random_argument"] is True
        assert new_tokenizer.init_kwargs["random_argument"] is False

    def test_get_vocab(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                vocab = tokenizer.get_vocab()

                self.assertIsInstance(vocab, dict)
                self.assertEqual(len(vocab), len(tokenizer))

                tokenizer.add_tokens(["asdfasdfasdfasdf"])
                vocab = tokenizer.get_vocab()
                self.assertIsInstance(vocab, dict)
                self.assertEqual(len(vocab), len(tokenizer))
def test_conversion_reversible ( self ) :
2020-06-16 05:12:51 +08:00
tokenizers = self . get_tokenizers ( do_lower_case = False )
for tokenizer in tokenizers :
with self . subTest ( f " { tokenizer . __class__ . __name__ } " ) :
vocab = tokenizer . get_vocab ( )
for word , ind in vocab . items ( ) :
self . assertEqual ( tokenizer . convert_tokens_to_ids ( word ) , ind )
self . assertEqual ( tokenizer . convert_ids_to_tokens ( ind ) , word )
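# The tokenizer's __call__ dispatches to encode_plus for single sequences (and pairs) and to
# batch_encode_plus for lists, so e.g. tokenizer(text) == tokenizer.encode_plus(text) (input illustrative).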
def test_call ( self ) :
# Tests that the tokenizer's __call__ method wraps encode_plus and batch_encode_plus
tokenizers = self . get_tokenizers ( do_lower_case = False )
for tokenizer in tokenizers :
with self . subTest ( f " { tokenizer . __class__ . __name__ } " ) :
sequences = [
" Testing batch encode plus " ,
" Testing batch encode plus with different sequence lengths " ,
" Testing batch encode plus with different sequence lengths correctly pads " ,
]
# Test not batched
encoded_sequences_1 = tokenizer . encode_plus ( sequences [ 0 ] )
encoded_sequences_2 = tokenizer ( sequences [ 0 ] )
self . assertEqual ( encoded_sequences_1 , encoded_sequences_2 )
# Test not batched pairs
encoded_sequences_1 = tokenizer . encode_plus ( sequences [ 0 ] , sequences [ 1 ] )
encoded_sequences_2 = tokenizer ( sequences [ 0 ] , sequences [ 1 ] )
self . assertEqual ( encoded_sequences_1 , encoded_sequences_2 )
# Test batched
encoded_sequences_1 = tokenizer . batch_encode_plus ( sequences )
encoded_sequences_2 = tokenizer ( sequences )
self . assertEqual ( encoded_sequences_1 , encoded_sequences_2 )
# Test batched pairs
encoded_sequences_1 = tokenizer . batch_encode_plus ( list ( zip ( sequences , sequences ) ) )
encoded_sequences_2 = tokenizer ( sequences , sequences )
self . assertEqual ( encoded_sequences_1 , encoded_sequences_2 )
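# The next test exercises the padding strategies of encode_plus / batch_encode_plus:
#   padding=False        -> no padding
#   padding=True         -> pad to the longest sequence in the batch (alias of padding="longest")
#   padding="max_length" -> pad to the provided max_length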
def test_batch_encode_plus_batch_sequence_length ( self ) :
# Tests that all encoded values have the correct size
tokenizers = self . get_tokenizers ( do_lower_case = False )
for tokenizer in tokenizers :
with self . subTest ( f " { tokenizer . __class__ . __name__ } " ) :
sequences = [
" Testing batch encode plus " ,
" Testing batch encode plus with different sequence lengths " ,
" Testing batch encode plus with different sequence lengths correctly pads " ,
]
encoded_sequences = [ tokenizer . encode_plus ( sequence ) for sequence in sequences ]
encoded_sequences_batch = tokenizer . batch_encode_plus ( sequences , padding = False )
self . assertListEqual (
encoded_sequences , self . convert_batch_encode_plus_format_to_encode_plus ( encoded_sequences_batch )
)
maximum_length = len (
max ( [ encoded_sequence [ " input_ids " ] for encoded_sequence in encoded_sequences ] , key = len )
)
# check correct behaviour when no pad_token_id exists, and add one if needed
self . _check_no_pad_token_padding ( tokenizer , sequences )
encoded_sequences_padded = [
tokenizer . encode_plus ( sequence , max_length = maximum_length , padding = " max_length " )
for sequence in sequences
]
encoded_sequences_batch_padded = tokenizer . batch_encode_plus ( sequences , padding = True )
self . assertListEqual (
encoded_sequences_padded ,
self . convert_batch_encode_plus_format_to_encode_plus ( encoded_sequences_batch_padded ) ,
)
# check that 'longest' padding is insensitive to max_length
encoded_sequences_batch_padded_1 = tokenizer . batch_encode_plus ( sequences , padding = True )
encoded_sequences_batch_padded_2 = tokenizer . batch_encode_plus (
sequences , max_length = maximum_length + 10 , padding = " longest "
)
for key in encoded_sequences_batch_padded_1 . keys ( ) :
self . assertListEqual (
encoded_sequences_batch_padded_1 [ key ] , encoded_sequences_batch_padded_2 [ key ] ,
)
# check that disabling padding is insensitive to max_length
encoded_sequences_batch_padded_1 = tokenizer . batch_encode_plus ( sequences , padding = False )
encoded_sequences_batch_padded_2 = tokenizer . batch_encode_plus (
sequences , max_length = maximum_length + 10 , padding = False
)
for key in encoded_sequences_batch_padded_1 . keys ( ) :
self . assertListEqual (
encoded_sequences_batch_padded_1 [ key ] , encoded_sequences_batch_padded_2 [ key ] ,
)
def test_added_token_serializable ( self ) :
tokenizers = self . get_tokenizers ( do_lower_case = False )
for tokenizer in tokenizers :
new_token = AddedToken ( " new_token " , lstrip = True )
tokenizer . add_special_tokens ( { " additional_special_tokens " : [ new_token ] } )
with tempfile . TemporaryDirectory ( ) as tmp_dir_name :
tokenizer . save_pretrained ( tmp_dir_name )
tokenizer . from_pretrained ( tmp_dir_name )
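# AddedToken options (here lstrip=True) must survive the save_pretrained / from_pretrained round trip;
# a serialization failure would surface as an error when reloading from tmp_dir_name.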
def test_batch_encode_plus_padding ( self ) :
# Test that padded sequences are equivalent between batch_encode_plus and encode_plus
# Right padding tests
tokenizers = self . get_tokenizers ( do_lower_case = False )
for tokenizer in tokenizers :
with self . subTest ( f " { tokenizer . __class__ . __name__ } " ) :
sequences = [
" Testing batch encode plus " ,
" Testing batch encode plus with different sequence lengths " ,
" Testing batch encode plus with different sequence lengths correctly pads " ,
]
max_length = 100
# check correct behaviour when no pad_token_id exists, and add one if needed
self . _check_no_pad_token_padding ( tokenizer , sequences )
encoded_sequences = [
tokenizer . encode_plus ( sequence , max_length = max_length , padding = " max_length " )
for sequence in sequences
]
encoded_sequences_batch = tokenizer . batch_encode_plus (
sequences , max_length = max_length , padding = " max_length "
)
self . assertListEqual (
encoded_sequences , self . convert_batch_encode_plus_format_to_encode_plus ( encoded_sequences_batch )
)
# Left padding tests
tokenizers = self . get_tokenizers ( do_lower_case = False )
for tokenizer in tokenizers :
with self . subTest ( f " { tokenizer . __class__ . __name__ } " ) :
tokenizer . padding_side = " left "
sequences = [
" Testing batch encode plus " ,
" Testing batch encode plus with different sequence lengths " ,
" Testing batch encode plus with different sequence lengths correctly pads " ,
]
max_length = 100
# check correct behaviour when no pad_token_id exists, and add one if needed
self . _check_no_pad_token_padding ( tokenizer , sequences )
encoded_sequences = [
tokenizer . encode_plus ( sequence , max_length = max_length , padding = " max_length " )
for sequence in sequences
]
encoded_sequences_batch = tokenizer . batch_encode_plus (
sequences , max_length = max_length , padding = " max_length "
)
self . assertListEqual (
encoded_sequences , self . convert_batch_encode_plus_format_to_encode_plus ( encoded_sequences_batch )
)
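# Pretokenized inputs are lists of tokens, e.g. "hello world".split() -> ["hello", "world"] (illustrative);
# with is_pretokenized=True the tokenizer should produce the same encoding as for the joined string.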
def test_pretokenized_inputs ( self ) :
# Test when inputs are pretokenized
tokenizers = self . get_tokenizers ( do_lower_case = False ) # , add_prefix_space=True)
for tokenizer in tokenizers :
with self . subTest ( f " { tokenizer . __class__ . __name__ } " ) :
# Prepare a sequence from our tokenizer vocabulary
sequence , ids = self . get_clean_sequence ( tokenizer , with_prefix_space = True , max_length = 20 )
# sequence = " " + sequence # To be sure the byte-level tokenizers are feeling good
token_sequence = sequence . split ( )
# sequence_no_prefix_space = sequence.strip()
# Test encode for pretokenized inputs
output = tokenizer . encode ( token_sequence , is_pretokenized = True , add_special_tokens = False )
output_sequence = tokenizer . encode ( sequence , add_special_tokens = False )
self . assertEqual ( output , output_sequence )
output = tokenizer . encode ( token_sequence , is_pretokenized = True , add_special_tokens = True )
output_sequence = tokenizer . encode ( sequence , add_special_tokens = True )
self . assertEqual ( output , output_sequence )
# Test encode_plus for pretokenized inputs
output = tokenizer . encode_plus ( token_sequence , is_pretokenized = True , add_special_tokens = False )
output_sequence = tokenizer . encode_plus ( sequence , add_special_tokens = False )
for key in output . keys ( ) :
self . assertEqual ( output [ key ] , output_sequence [ key ] )
output = tokenizer . encode_plus ( token_sequence , is_pretokenized = True , add_special_tokens = True )
output_sequence = tokenizer . encode_plus ( sequence , add_special_tokens = True )
for key in output . keys ( ) :
self . assertEqual ( output [ key ] , output_sequence [ key ] )
# Test batch_encode_plus for pretokenized inputs
sequence_batch = [ sequence . strip ( ) ] * 2 + [ sequence . strip ( ) + " " + sequence . strip ( ) ]
token_sequence_batch = [ s . split ( ) for s in sequence_batch ]
sequence_batch_cleaned_up_spaces = [ " " + " " . join ( s ) for s in token_sequence_batch ]
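# A leading space is prepended when rebuilding the strings, presumably so that byte-level tokenizers
# (which fold a leading space into the first token) encode them consistently with the pretokenized lists.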
output = tokenizer . batch_encode_plus (
token_sequence_batch , is_pretokenized = True , add_special_tokens = False
)
output_sequence = tokenizer . batch_encode_plus (
sequence_batch_cleaned_up_spaces , add_special_tokens = False
)
for key in output . keys ( ) :
self . assertEqual ( output [ key ] , output_sequence [ key ] )
output = tokenizer . batch_encode_plus (
token_sequence_batch , is_pretokenized = True , add_special_tokens = True
)
output_sequence = tokenizer . batch_encode_plus (
sequence_batch_cleaned_up_spaces , add_special_tokens = True
)
for key in output . keys ( ) :
self . assertEqual ( output [ key ] , output_sequence [ key ] )
# Test encode for pretokenized inputs pairs
output = tokenizer . encode (
token_sequence , token_sequence , is_pretokenized = True , add_special_tokens = False
)
output_sequence = tokenizer . encode ( sequence , sequence , add_special_tokens = False )
self . assertEqual ( output , output_sequence )
output = tokenizer . encode (
token_sequence , token_sequence , is_pretokenized = True , add_special_tokens = True
)
output_sequence = tokenizer . encode ( sequence , sequence , add_special_tokens = True )
self . assertEqual ( output , output_sequence )
# Test encode_plus for pretokenized inputs pairs
output = tokenizer . encode_plus (
token_sequence , token_sequence , is_pretokenized = True , add_special_tokens = False
)
output_sequence = tokenizer . encode_plus ( sequence , sequence , add_special_tokens = False )
for key in output . keys ( ) :
self . assertEqual ( output [ key ] , output_sequence [ key ] )
output = tokenizer . encode_plus (
token_sequence , token_sequence , is_pretokenized = True , add_special_tokens = True
)
output_sequence = tokenizer . encode_plus ( sequence , sequence , add_special_tokens = True )
for key in output . keys ( ) :
self . assertEqual ( output [ key ] , output_sequence [ key ] )
# Test batch_encode_plus for pretokenized inputs pairs
sequence_pair_batch = [ ( sequence . strip ( ) , sequence . strip ( ) ) ] * 2 + [
( sequence . strip ( ) + " " + sequence . strip ( ) , sequence . strip ( ) )
]
token_sequence_pair_batch = [ tuple ( s . split ( ) for s in pair ) for pair in sequence_pair_batch ]
sequence_pair_batch_cleaned_up_spaces = [
tuple ( " " + " " . join ( s ) for s in pair ) for pair in token_sequence_pair_batch
]
output = tokenizer . batch_encode_plus (
token_sequence_pair_batch , is_pretokenized = True , add_special_tokens = False
)
output_sequence = tokenizer . batch_encode_plus (
sequence_pair_batch_cleaned_up_spaces , add_special_tokens = False
)
for key in output . keys ( ) :
self . assertEqual ( output [ key ] , output_sequence [ key ] )
output = tokenizer . batch_encode_plus (
token_sequence_pair_batch , is_pretokenized = True , add_special_tokens = True
)
output_sequence = tokenizer . batch_encode_plus (
sequence_pair_batch_cleaned_up_spaces , add_special_tokens = True
)
for key in output . keys ( ) :
self . assertEqual ( output [ key ] , output_sequence [ key ] )
def test_prepare_for_model ( self ) :
tokenizers = self . get_tokenizers ( do_lower_case = False )
for tokenizer in tokenizers :
string_sequence = " Testing the prepare_for_model method. "
ids = tokenizer . encode ( string_sequence , add_special_tokens = False )
input_dict = tokenizer . encode_plus ( string_sequence )
prepared_input_dict = tokenizer . prepare_for_model ( ids )
self . assertEqual ( input_dict , prepared_input_dict )
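# In other words, encode_plus(text) should be equivalent to encode(text, add_special_tokens=False)
# followed by prepare_for_model(ids), which adds the special tokens and builds the final model inputs.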
@require_torch
@require_tf
def test_batch_encode_plus_tensors ( self ) :
tokenizers = self . get_tokenizers ( do_lower_case = False )
for tokenizer in tokenizers :
with self . subTest ( f " { tokenizer . __class__ . __name__ } " ) :
sequences = [
" Testing batch encode plus " ,
" Testing batch encode plus with different sequence lengths " ,
" Testing batch encode plus with different sequence lengths correctly pads " ,
]
# A tensor cannot be built from sequences that do not all have the same length
self . assertRaises ( ValueError , tokenizer . batch_encode_plus , sequences , return_tensors = " pt " )
self . assertRaises ( ValueError , tokenizer . batch_encode_plus , sequences , return_tensors = " tf " )
if tokenizer . pad_token_id is None :
self . assertRaises (
ValueError , tokenizer . batch_encode_plus , sequences , padding = True , return_tensors = " pt " ,
)
self . assertRaises (
ValueError , tokenizer . batch_encode_plus , sequences , padding = " longest " , return_tensors = " tf " ,
)
else :
pytorch_tensor = tokenizer . batch_encode_plus ( sequences , padding = True , return_tensors = " pt " )
tensorflow_tensor = tokenizer . batch_encode_plus ( sequences , padding = " longest " , return_tensors = " tf " )
encoded_sequences = tokenizer . batch_encode_plus ( sequences , padding = True )
for key in encoded_sequences . keys ( ) :
pytorch_value = pytorch_tensor [ key ] . tolist ( )
tensorflow_value = tensorflow_tensor [ key ] . numpy ( ) . tolist ( )
encoded_value = encoded_sequences [ key ]
self . assertEqual ( pytorch_value , tensorflow_value , encoded_value )
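# Whatever the framework, the padded batches should agree element-wise:
# pytorch_tensor[key].tolist() == tensorflow_tensor[key].numpy().tolist() == encoded_sequences[key].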
def _check_no_pad_token_padding ( self , tokenizer , sequences ) :
# if the tokenizer has no pad_token_id, a ValueError should be raised
if tokenizer . pad_token_id is None :
with self . assertRaises ( ValueError ) :
if isinstance ( sequences , list ) :
tokenizer . batch_encode_plus ( sequences , padding = " longest " )
else :
tokenizer . encode_plus ( sequences , padding = True )
# add pad_token_id to pass subsequent tests
tokenizer . add_special_tokens ( { " pad_token " : " <PAD> " } )
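# Minimal sketch of what this helper guards against (inputs illustrative):
#   tokenizer.pad_token_id is None
#   tokenizer.batch_encode_plus(["a", "bb"], padding="longest")   # raises ValueError
#   tokenizer.add_special_tokens({"pad_token": "<PAD>"})          # padding now works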
@slow
@require_torch
def test_torch_encode_plus_sent_to_model ( self ) :
import torch
from transformers import MODEL_MAPPING , TOKENIZER_MAPPING
MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings ( MODEL_MAPPING , TOKENIZER_MAPPING )
tokenizers = self . get_tokenizers ( do_lower_case = False )
for tokenizer in tokenizers :
with self . subTest ( f " { tokenizer . __class__ . __name__ } " ) :
if tokenizer . __class__ not in MODEL_TOKENIZER_MAPPING :
return
config_class , model_class = MODEL_TOKENIZER_MAPPING [ tokenizer . __class__ ]
config = config_class ( )
if config . is_encoder_decoder or config . pad_token_id is None :
return
model = model_class ( config )
# Make sure the model contains at least the full vocabulary size in its embedding matrix
is_using_common_embeddings = hasattr ( model . get_input_embeddings ( ) , " weight " )
assert (
( model . get_input_embeddings ( ) . weight . shape [ 0 ] >= len ( tokenizer ) )
if is_using_common_embeddings
else True
)
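# Models whose input embeddings do not expose a `weight` attribute are exempted from the check above;
# for common embedding matrices we require at least len(tokenizer) rows.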
# Build sequence
first_ten_tokens = list ( tokenizer . get_vocab ( ) . keys ( ) ) [ : 10 ]
sequence = " " . join ( first_ten_tokens )
encoded_sequence = tokenizer . encode_plus ( sequence , return_tensors = " pt " )
batch_encoded_sequence = tokenizer . batch_encode_plus ( [ sequence , sequence ] , return_tensors = " pt " )
# This should not fail
with torch . no_grad ( ) : # saves some time
model ( * * encoded_sequence )
model ( * * batch_encoded_sequence )
# if self.test_rust_tokenizer:
# fast_tokenizer = self.get_rust_tokenizer()
# encoded_sequence_fast = fast_tokenizer.encode_plus(sequence, return_tensors="pt")
# batch_encoded_sequence_fast = fast_tokenizer.batch_encode_plus([sequence, sequence], return_tensors="pt")
# # This should not fail
# model(**encoded_sequence_fast)
# model(**batch_encoded_sequence_fast)
@slow
@require_tf
def test_tf_encode_plus_sent_to_model ( self ) :
from transformers import TF_MODEL_MAPPING , TOKENIZER_MAPPING
MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings ( TF_MODEL_MAPPING , TOKENIZER_MAPPING )
tokenizers = self . get_tokenizers ( do_lower_case = False )
for tokenizer in tokenizers :
with self . subTest ( f " { tokenizer . __class__ . __name__ } " ) :
if tokenizer . __class__ not in MODEL_TOKENIZER_MAPPING :
return
config_class , model_class = MODEL_TOKENIZER_MAPPING [ tokenizer . __class__ ]
config = config_class ( )
if config . is_encoder_decoder or config . pad_token_id is None :
return
model = model_class ( config )
# Make sure the model contains at least the full vocabulary size in its embedding matrix
assert model . config . vocab_size >= len ( tokenizer )
# Build sequence
first_ten_tokens = list ( tokenizer . get_vocab ( ) . keys ( ) ) [ : 10 ]
sequence = " " . join ( first_ten_tokens )
encoded_sequence = tokenizer . encode_plus ( sequence , return_tensors = " tf " )
batch_encoded_sequence = tokenizer . batch_encode_plus ( [ sequence , sequence ] , return_tensors = " tf " )
# This should not fail
model ( encoded_sequence )
model ( batch_encoded_sequence )
# TODO: Check whether require_torch is the best way to gate this numpy test ... Maybe move to require_flax when available
@slow
@require_torch
def test_np_encode_plus_sent_to_model ( self ) :
from transformers import MODEL_MAPPING , TOKENIZER_MAPPING
MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings ( MODEL_MAPPING , TOKENIZER_MAPPING )
tokenizer = self . get_tokenizer ( )
if tokenizer . __class__ not in MODEL_TOKENIZER_MAPPING :
return
config_class , model_class = MODEL_TOKENIZER_MAPPING [ tokenizer . __class__ ]
config = config_class ( )
if config . is_encoder_decoder or config . pad_token_id is None :
return
# Build sequence
first_ten_tokens = list ( tokenizer . get_vocab ( ) . keys ( ) ) [ : 10 ]
sequence = " " . join ( first_ten_tokens )
encoded_sequence = tokenizer . encode_plus ( sequence , return_tensors = " np " )
batch_encoded_sequence = tokenizer . batch_encode_plus ( [ sequence , sequence ] , return_tensors = " np " )
# TODO: add forward through JAX/Flax when PR is merged
# This is currently here to make flake8 happy!
if encoded_sequence is None :
raise ValueError ( " Cannot convert list to numpy tensor on encode_plus() " )
if batch_encoded_sequence is None :
raise ValueError ( " Cannot convert list to numpy tensor on batch_encode_plus() " )
if self . test_rust_tokenizer :
fast_tokenizer = self . get_rust_tokenizer ( )
encoded_sequence_fast = fast_tokenizer . encode_plus ( sequence , return_tensors = " np " )
batch_encoded_sequence_fast = fast_tokenizer . batch_encode_plus ( [ sequence , sequence ] , return_tensors = " np " )
# TODO: add forward through JAX/Flax when PR is merged
# This is currently here to make flake8 happy!
if encoded_sequence_fast is None :
raise ValueError ( " Cannot convert list to numpy tensor on encode_plus() (fast) " )
if batch_encoded_sequence_fast is None :
raise ValueError ( " Cannot convert list to numpy tensor on batch_encode_plus() (fast) " )
@require_torch
def test_prepare_seq2seq_batch ( self ) :
tokenizer = self . get_tokenizer ( )
if not hasattr ( tokenizer , " prepare_seq2seq_batch " ) :
return
# Longer text that will definitely require truncation.
src_text = [
" UN Chief Says There Is No Military Solution in Syria " ,
" Secretary-General Ban Ki-moon says his response to Russia ' s stepped up military support for Syria is that ' there is no military solution ' to the nearly five-year conflict and more weapons will only worsen the violence and misery for millions of people. " ,
]
tgt_text = [
" Şeful ONU declară că nu există o soluţie militară în Siria " ,
" Secretarul General Ban Ki-moon declară că răspunsul său la intensificarea sprijinului militar al Rusiei "
' pentru Siria este că " nu există o soluţie militară " la conflictul de aproape cinci ani şi că noi arme nu '
" vor face decât să înrăutăţească violenţele şi mizeria pentru milioane de oameni. " ,
]
batch = tokenizer . prepare_seq2seq_batch (
src_texts = src_text , tgt_texts = tgt_text , max_length = 3 , max_target_length = 10 , return_tensors = " pt "
)
self . assertEqual ( batch . input_ids . shape [ 1 ] , 3 )
self . assertEqual ( batch . decoder_input_ids . shape [ 1 ] , 10 )
# max_target_length will default to max_length if not specified
batch = tokenizer . prepare_seq2seq_batch ( src_text , tgt_texts = tgt_text , max_length = 3 )
self . assertEqual ( batch . input_ids . shape [ 1 ] , 3 )
self . assertEqual ( batch . decoder_input_ids . shape [ 1 ] , 3 )
batch_encoder_only = tokenizer . prepare_seq2seq_batch (
src_texts = src_text , max_length = 3 , max_target_length = 10 , return_tensors = " pt "
)
self . assertEqual ( batch_encoder_only . input_ids . shape [ 1 ] , 3 )
self . assertEqual ( batch_encoder_only . attention_mask . shape [ 1 ] , 3 )
self . assertNotIn ( " decoder_input_ids " , batch_encoder_only )