Compare commits

...

5 Commits

Author SHA1 Message Date
Arthur Zucker d662f59f40 update 2024-05-06 12:10:01 +02:00
Arthur Zucker 478715f61f Merge branch 'main' of github.com:huggingface/transformers into fix-added-tokens-fast 2024-05-06 12:03:03 +02:00
Arthur Zucker e4677fcb63 nits 2024-05-06 12:02:58 +02:00
Arthur Zucker 6092ee8388 update 2024-04-30 16:05:06 +02:00
Arthur Zucker 314a529e6c update based on tokenizers release 2024-04-30 15:55:38 +02:00
4 changed files with 52 additions and 89 deletions

View File

@@ -999,9 +999,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
         self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)

         filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
-        legacy_added_tokens = set(self._added_tokens_encoder.keys()) - set(self.all_special_tokens) | {
-            token for token in self.additional_special_tokens if self.convert_tokens_to_ids(token) >= self.vocab_size
-        }
+        encoder_keys = self._added_tokens_encoder.keys()
         # To avoid mixing byte-level and unicode for byte-level BPT
         # we need to build string separately for added tokens and byte-level tokens
         # cf. https://github.com/huggingface/transformers/issues/1133
@@ -1011,7 +1009,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
         for token in filtered_tokens:
             if skip_special_tokens and token in self.all_special_ids:
                 continue
-            if token in legacy_added_tokens:
+            if token in encoder_keys:
                 if current_sub_text:
                     string = self.convert_tokens_to_string(current_sub_text)
                     if len(string) > 0:
@@ -1022,11 +1020,9 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
                 current_sub_text.append(token)
         if current_sub_text:
             sub_texts.append(self.convert_tokens_to_string(current_sub_text))

-        if spaces_between_special_tokens:
-            text = " ".join(sub_texts)
-        else:
-            text = "".join(sub_texts)
+        # we do not add spaces between special tokens. If you want some, add them yourself.
+        text = "".join(sub_texts)

         clean_up_tokenization_spaces = (
             clean_up_tokenization_spaces
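
With the hunk above, the slow tokenizer's `_decode` always joins the collected segments with an empty string, so spaces around special tokens are no longer inserted automatically ("If you want some, add them yourself"). A minimal caller-side sketch of re-adding those spaces if they are wanted; the checkpoint name and input sentence are illustrative, not part of this diff:

```python
from transformers import AutoTokenizer

# Illustrative checkpoint and sentence; any slow (Python) tokenizer works the same way.
tok = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased", use_fast=False)
ids = tok.encode("This shouldn't be! He'll go.")

# decode() now concatenates segments without spaces around special tokens.
plain = tok.decode(ids)

# To put spaces around special tokens, split the token stream into
# special / non-special spans and join the spans with " ".
tokens = tok.convert_ids_to_tokens(ids)
spans, current = [], []
for t in tokens:
    if t in tok.all_special_tokens:
        if current:
            spans.append(tok.convert_tokens_to_string(current))
            current = []
        spans.append(t)
    else:
        current.append(t)
if current:
    spans.append(tok.convert_tokens_to_string(current))
spaced = " ".join(spans)

print(plain)
print(spaced)
```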

View File

@@ -154,42 +154,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
         # We call this after having initialized the backend tokenizer because we update it.
         super().__init__(**kwargs)

-        # The following logic will be replace with a single add_tokens once a fix is pushed to tokenizers
-        # allows converting a slow -> fast, non-legacy: if the `tokenizer.json` does not have all the added tokens
-        # uses the information stored in `added_tokens_decoder`.
-        # this is costly for fast tokenizers as we re-compute the regex again. But not all tokens are added tokens
-        tokens_to_add = [
-            token
-            for index, token in sorted(added_tokens_decoder.items(), key=lambda x: x[0])
-            if token not in self.added_tokens_decoder
-        ]
-        encoder = list(self.added_tokens_encoder.keys()) + [str(token) for token in tokens_to_add]
-        # if some of the special tokens are strings, we check if we don't already have a token
-        tokens_to_add += [
-            token for token in self.all_special_tokens_extended if token not in encoder and token not in tokens_to_add
-        ]
-
-        if len(tokens_to_add) > 0:
-            # super hack: if a token.special is set, tokenizer ignores it for now so FIXME @ArthurZ
-            # Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for
-            # individual tokens would repeatedly rebuild a trie, which can be slow.
-            is_last_special = None
-            tokens = []
-            special_tokens = self.all_special_tokens
-            for token in tokens_to_add:
-                is_special = (
-                    (token.special or str(token) in special_tokens)
-                    if isinstance(token, AddedToken)
-                    else str(token) in special_tokens
-                )
-                if is_last_special is None or is_last_special == is_special:
-                    tokens.append(token)
-                else:
-                    self._add_tokens(tokens, special_tokens=is_last_special)
-                    tokens = [token]
-                is_last_special = is_special
-            if tokens:
-                self._add_tokens(tokens, special_tokens=is_last_special)
+        self._add_tokens(list(added_tokens_decoder.values()))

     @property
     def is_fast(self) -> bool:
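
The block removed above grouped consecutive added tokens by their special flag so that, per its own comment, the backend trie was rebuilt once per run rather than once per token; it is replaced by a single `_add_tokens` call over `added_tokens_decoder`. A compact sketch of that removed batching pattern (not the library's code; `itertools.groupby` stands in for the manual `is_last_special` bookkeeping):

```python
from itertools import groupby

from tokenizers import AddedToken  # same AddedToken type the removed code checks against


def add_in_runs(tokenizer, tokens_to_add, special_tokens):
    """Sketch of the removed batching idea: group consecutive tokens that share
    the same 'special' status and add each run with one _add_tokens call, so the
    backend regex/trie is rebuilt once per run instead of once per token."""

    def is_special(token):
        if isinstance(token, AddedToken):
            return token.special or str(token) in special_tokens
        return str(token) in special_tokens

    for special, run in groupby(tokens_to_add, key=is_special):
        tokenizer._add_tokens(list(run), special_tokens=special)
```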

View File

@@ -4119,51 +4119,7 @@ class TokenizerTesterMixin:
             # Should not raise an error
             self.rust_tokenizer_class.from_pretrained(tmp_dir_2)

+    # TODO This is ran for all models but only tests bert...
-    def test_clean_up_tokenization_spaces(self):
-        tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
-        assert tokenizer.clean_up_tokenization_spaces is True
-
-        tokens = tokenizer.encode("This shouldn't be! He'll go.")
-        decoded = tokenizer.decode(tokens)
-        assert decoded == "[CLS] this shouldn't be! he'll go. [SEP]"
-
-        tokenizer.clean_up_tokenization_spaces = False
-        decoded = tokenizer.decode(tokens)
-        assert decoded == "[CLS] this shouldn ' t be ! he ' ll go . [SEP]"
-        assert decoded == tokenizer.decode(tokens, clean_up_tokenization_spaces=False)
-
-        # Fast from slow
-        with tempfile.TemporaryDirectory() as tmp_dir_2:
-            tokenizer.save_pretrained(tmp_dir_2)
-            tokenizer_fast = BertTokenizerFast.from_pretrained(tmp_dir_2)
-            del tokenizer
-            assert tokenizer_fast.clean_up_tokenization_spaces is False
-            decoded = tokenizer_fast.decode(tokens)
-            # fast and slow don't have the same output when we don't cleanup
-            # tokenization space. Here `be!` vs `be !` and `go.` vs `go .`
-            assert decoded == "[CLS] this shouldn ' t be! he ' ll go. [SEP]"
-
-            tokenizer_fast.clean_up_tokenization_spaces = True
-            assert tokenizer_fast.clean_up_tokenization_spaces is True
-            decoded = tokenizer_fast.decode(tokens)
-            assert decoded == "[CLS] this shouldn't be! he'll go. [SEP]"
-
-        # Slow from fast
-        with tempfile.TemporaryDirectory() as tmp_dir_2:
-            tokenizer_fast.clean_up_tokenization_spaces = False
-            tokenizer_fast.save_pretrained(tmp_dir_2)
-            tokenizer = BertTokenizer.from_pretrained(tmp_dir_2)
-            assert tokenizer.clean_up_tokenization_spaces is False
-            decoded = tokenizer.decode(tokens)
-            assert decoded == "[CLS] this shouldn ' t be ! he ' ll go . [SEP]"
-
-            tokenizer.clean_up_tokenization_spaces = True
-            decoded = tokenizer.decode(tokens)
-            assert decoded == "[CLS] this shouldn't be! he'll go. [SEP]"

     def test_split_special_tokens(self):
         if not self.test_slow_tokenizer:
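
The TODO above notes that the removed test lives in the model-agnostic `TokenizerTesterMixin` yet hard-codes BERT checkpoints. As a hedged sketch of the usual alternative (the test name is invented; `self.get_tokenizer` is the mixin's existing per-model factory), a mixin test would normally exercise whichever tokenizer the concrete test class wires in:

```python
def test_decode_round_trip(self):
    # Hypothetical mixin-style test: uses the mixin's tokenizer factory
    # rather than a hard-coded BertTokenizer checkpoint, so every model's
    # test class runs it against its own tokenizer.
    tokenizer = self.get_tokenizer()
    ids = tokenizer.encode("This shouldn't be! He'll go.")
    decoded = tokenizer.decode(ids)
    self.assertIsInstance(decoded, str)
```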

View File

@@ -274,3 +274,49 @@ class TrieTest(unittest.TestCase):
         trie = Trie()
         parts = trie.cut_text("ABC", [0, 0, 2, 1, 2, 3])
         self.assertEqual(parts, ["AB", "C"])
+
+    def test_clean_up_tokenization_spaces(self):
+        tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
+        tokenizer.spaces_between_special_tokens = True
+        assert tokenizer.clean_up_tokenization_spaces is True
+
+        tokens = tokenizer.encode("This shouldn't be! He'll go.")
+        decoded = tokenizer.decode(tokens)
+        assert decoded == "[CLS] this shouldn't be! he'll go. [SEP]"
+
+        tokenizer.clean_up_tokenization_spaces = False
+        decoded = tokenizer.decode(tokens)
+        assert decoded == "[CLS] this shouldn ' t be ! he ' ll go . [SEP]"
+        assert decoded == tokenizer.decode(tokens, clean_up_tokenization_spaces=False)
+
+        # Fast from slow
+        with tempfile.TemporaryDirectory() as tmp_dir_2:
+            tokenizer.save_pretrained(tmp_dir_2)
+            tokenizer_fast = BertTokenizerFast.from_pretrained(tmp_dir_2)
+            del tokenizer
+            assert tokenizer_fast.clean_up_tokenization_spaces is False
+            decoded = tokenizer_fast.decode(tokens, spaces_between_special_tokens=True)
+            # fast and slow don't have the same output when we don't cleanup
+            # tokenization space. Here `be!` vs `be !` and `go.` vs `go .`
+            assert decoded == "[CLS]this shouldn ' t be! he ' ll go.[SEP]"
+
+            tokenizer_fast.clean_up_tokenization_spaces = True
+            assert tokenizer_fast.clean_up_tokenization_spaces is True
+            decoded = tokenizer_fast.decode(tokens)
+            assert decoded == "[CLS]this shouldn't be! he'll go.[SEP]"
+
+        # Slow from fast
+        with tempfile.TemporaryDirectory() as tmp_dir_2:
+            tokenizer_fast.clean_up_tokenization_spaces = False
+            tokenizer_fast.save_pretrained(tmp_dir_2)
+            tokenizer = BertTokenizer.from_pretrained(tmp_dir_2)
+            assert tokenizer.clean_up_tokenization_spaces is False
+            decoded = tokenizer.decode(tokens)
+            assert decoded == "[CLS]this shouldn ' t be ! he ' ll go .[SEP]"
+
+            tokenizer.clean_up_tokenization_spaces = True
+            decoded = tokenizer.decode(tokens)
+            assert decoded == "[CLS]this shouldn't be! he'll go.[SEP]"
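
The expected strings in the added test differ only in whether tokenization spaces are cleaned up. A small sketch of what that flag toggles, assuming the base class's `clean_up_tokenization` static helper (which collapses spaces before punctuation and around English contractions) is the step being switched on and off:

```python
from transformers import PreTrainedTokenizerBase

raw = "[CLS] this shouldn ' t be ! he ' ll go . [SEP]"
# clean_up_tokenization_spaces=True runs this cleanup over the decoded string;
# with the flag set to False the raw, space-separated form above is returned as-is.
cleaned = PreTrainedTokenizerBase.clean_up_tokenization(raw)
print(cleaned)  # expected: "[CLS] this shouldn't be! he'll go. [SEP]"
```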