Compare commits
5 Commits
main...fix-added-
Author | SHA1 | Date |
---|---|---|
Arthur Zucker | d662f59f40 | |
Arthur Zucker | 478715f61f | |
Arthur Zucker | e4677fcb63 | |
Arthur Zucker | 6092ee8388 | |
Arthur Zucker | 314a529e6c | |
```diff
@@ -999,9 +999,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
         self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)
 
         filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
-        legacy_added_tokens = set(self._added_tokens_encoder.keys()) - set(self.all_special_tokens) | {
-            token for token in self.additional_special_tokens if self.convert_tokens_to_ids(token) >= self.vocab_size
-        }
+        encoder_keys = self._added_tokens_encoder.keys()
         # To avoid mixing byte-level and unicode for byte-level BPT
         # we need to build string separately for added tokens and byte-level tokens
         # cf. https://github.com/huggingface/transformers/issues/1133
```
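The two definitions swapped in the hunk above decide which tokens get the added-token treatment during decoding: the removed `legacy_added_tokens` leaves out special tokens that already live in the base vocabulary, while the new `encoder_keys` is simply every key of `_added_tokens_encoder`. A toy sketch of that difference, with made-up values rather than a real tokenizer's state:

```python
# Toy values standing in for a tokenizer's internal state.
added_tokens_encoder = {"[CLS]": 101, "[SEP]": 102, "<custom>": 30522}
all_special_tokens = ["[CLS]", "[SEP]"]
additional_special_tokens = []  # none in this toy example
vocab_size = 30522

# Old definition: added tokens minus specials, plus any extra specials outside the base vocab.
legacy_added_tokens = set(added_tokens_encoder) - set(all_special_tokens) | {
    t for t in additional_special_tokens if added_tokens_encoder.get(t, 0) >= vocab_size
}
# New definition: every registered added token, specials included.
encoder_keys = added_tokens_encoder.keys()

print(legacy_added_tokens)   # {'<custom>'}
print(sorted(encoder_keys))  # ['<custom>', '[CLS]', '[SEP]']
```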
```diff
@@ -1011,7 +1009,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
         for token in filtered_tokens:
             if skip_special_tokens and token in self.all_special_ids:
                 continue
-            if token in legacy_added_tokens:
+            if token in encoder_keys:
                 if current_sub_text:
                     string = self.convert_tokens_to_string(current_sub_text)
                     if len(string) > 0:
```
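In the loop shown here, ordinary tokens accumulate in `current_sub_text` and the buffer is flushed whenever one of those added tokens appears, so added tokens are never passed through `convert_tokens_to_string` together with byte-level pieces. A simplified, self-contained sketch of that flush pattern (the helper below is illustrative, not the library's method):

```python
# Illustrative flush pattern: added tokens are kept apart from the
# pieces that go through convert_tokens_to_string.
def split_around_added(tokens, added):
    sub_texts, current = [], []
    for tok in tokens:
        if tok in added:
            if current:  # flush whatever was accumulated so far
                sub_texts.append(" ".join(current))
                current = []
            sub_texts.append(tok)  # the added token becomes its own piece
        else:
            current.append(tok)
    if current:
        sub_texts.append(" ".join(current))
    return sub_texts

print(split_around_added(["[CLS]", "hello", "world", "[SEP]"], {"[CLS]", "[SEP]"}))
# ['[CLS]', 'hello world', '[SEP]']
```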
```diff
@@ -1022,11 +1020,9 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
                 current_sub_text.append(token)
         if current_sub_text:
             sub_texts.append(self.convert_tokens_to_string(current_sub_text))
 
-        if spaces_between_special_tokens:
-            text = " ".join(sub_texts)
-        else:
-            text = "".join(sub_texts)
+        # we do not add spaces between special tokens. If you want some, add them yourself.
+        text = "".join(sub_texts)
 
         clean_up_tokenization_spaces = (
             clean_up_tokenization_spaces
```
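The net effect of this hunk is that the slow tokenizer's `decode` stops inserting spaces of its own around special tokens. A minimal sketch of the observable difference, assuming a stock BERT checkpoint; the exact output depends on which side of this diff is installed:

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
ids = tokenizer.encode("hello world")

# Old behaviour (" ".join of the sub-texts): "[CLS] hello world [SEP]"
# New behaviour ("".join of the sub-texts):  "[CLS]hello world[SEP]"
print(tokenizer.decode(ids))
```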
```diff
@@ -154,42 +154,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
         # We call this after having initialized the backend tokenizer because we update it.
         super().__init__(**kwargs)
 
-        # The following logic will be replace with a single add_tokens once a fix is pushed to tokenizers
-        # allows converting a slow -> fast, non-legacy: if the `tokenizer.json` does not have all the added tokens
-        # uses the information stored in `added_tokens_decoder`.
-        # this is costly for fast tokenizers as we re-compute the regex again. But not all tokens are added tokens
-        tokens_to_add = [
-            token
-            for index, token in sorted(added_tokens_decoder.items(), key=lambda x: x[0])
-            if token not in self.added_tokens_decoder
-        ]
-        encoder = list(self.added_tokens_encoder.keys()) + [str(token) for token in tokens_to_add]
-        # if some of the special tokens are strings, we check if we don't already have a token
-        tokens_to_add += [
-            token for token in self.all_special_tokens_extended if token not in encoder and token not in tokens_to_add
-        ]
-        if len(tokens_to_add) > 0:
-            # super hack: if a token.special is set, tokenizer ignores it for now so FIXME @ArthurZ
-            # Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for
-            # individual tokens would repeatedly rebuild a trie, which can be slow.
-            is_last_special = None
-            tokens = []
-            special_tokens = self.all_special_tokens
-            for token in tokens_to_add:
-                is_special = (
-                    (token.special or str(token) in special_tokens)
-                    if isinstance(token, AddedToken)
-                    else str(token) in special_tokens
-                )
-                if is_last_special is None or is_last_special == is_special:
-                    tokens.append(token)
-                else:
-                    self._add_tokens(tokens, special_tokens=is_last_special)
-                    tokens = [token]
-                is_last_special = is_special
-            if tokens:
-                self._add_tokens(tokens, special_tokens=is_last_special)
+        self._add_tokens(list(added_tokens_decoder.values()))
 
     @property
     def is_fast(self) -> bool:
```
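The removed block batched consecutive added tokens by their `special` flag so that `_add_tokens`, and the trie rebuild behind it, ran once per run of tokens rather than once per token; the branch collapses all of it into a single call over `added_tokens_decoder.values()`. A standalone sketch of that grouping idea, using a hypothetical helper rather than the tokenizer API:

```python
from itertools import groupby

# Hypothetical helper mirroring the removed batching logic: group a list of
# (token, is_special) pairs into consecutive runs sharing the same flag,
# so each run can be added with one batched call.
def batch_by_special(tokens_with_flags):
    batches = []
    for is_special, run in groupby(tokens_with_flags, key=lambda pair: pair[1]):
        batches.append(([tok for tok, _ in run], is_special))
    return batches

print(batch_by_special([("<pad>", True), ("<eos>", True), ("<custom>", False)]))
# [(['<pad>', '<eos>'], True), (['<custom>'], False)]
```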
```diff
@@ -4119,51 +4119,7 @@ class TokenizerTesterMixin:
             # Should not raise an error
             self.rust_tokenizer_class.from_pretrained(tmp_dir_2)
 
-    # TODO This is ran for all models but only tests bert...
-    def test_clean_up_tokenization_spaces(self):
-        tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
-        assert tokenizer.clean_up_tokenization_spaces is True
-
-        tokens = tokenizer.encode("This shouldn't be! He'll go.")
-        decoded = tokenizer.decode(tokens)
-        assert decoded == "[CLS] this shouldn't be! he'll go. [SEP]"
-
-        tokenizer.clean_up_tokenization_spaces = False
-        decoded = tokenizer.decode(tokens)
-        assert decoded == "[CLS] this shouldn ' t be ! he ' ll go . [SEP]"
-        assert decoded == tokenizer.decode(tokens, clean_up_tokenization_spaces=False)
-
-        # Fast from slow
-        with tempfile.TemporaryDirectory() as tmp_dir_2:
-            tokenizer.save_pretrained(tmp_dir_2)
-            tokenizer_fast = BertTokenizerFast.from_pretrained(tmp_dir_2)
-            del tokenizer
-
-        assert tokenizer_fast.clean_up_tokenization_spaces is False
-        decoded = tokenizer_fast.decode(tokens)
-        # fast and slow don't have the same output when we don't cleanup
-        # tokenization space. Here `be!` vs `be !` and `go.` vs `go .`
-        assert decoded == "[CLS] this shouldn ' t be! he ' ll go. [SEP]"
-
-        tokenizer_fast.clean_up_tokenization_spaces = True
-        assert tokenizer_fast.clean_up_tokenization_spaces is True
-
-        decoded = tokenizer_fast.decode(tokens)
-        assert decoded == "[CLS] this shouldn't be! he'll go. [SEP]"
-
-        # Slow from fast
-        with tempfile.TemporaryDirectory() as tmp_dir_2:
-            tokenizer_fast.clean_up_tokenization_spaces = False
-            tokenizer_fast.save_pretrained(tmp_dir_2)
-            tokenizer = BertTokenizer.from_pretrained(tmp_dir_2)
-
-        assert tokenizer.clean_up_tokenization_spaces is False
-        decoded = tokenizer.decode(tokens)
-        assert decoded == "[CLS] this shouldn ' t be ! he ' ll go . [SEP]"
-
-        tokenizer.clean_up_tokenization_spaces = True
-        decoded = tokenizer.decode(tokens)
-        assert decoded == "[CLS] this shouldn't be! he'll go. [SEP]"
 
     def test_split_special_tokens(self):
         if not self.test_slow_tokenizer:
```
```diff
@@ -274,3 +274,49 @@ class TrieTest(unittest.TestCase):
         trie = Trie()
         parts = trie.cut_text("ABC", [0, 0, 2, 1, 2, 3])
         self.assertEqual(parts, ["AB", "C"])
+
+    def test_clean_up_tokenization_spaces(self):
+        tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
+        tokenizer.spaces_between_special_tokens = True
+        assert tokenizer.clean_up_tokenization_spaces is True
+
+        tokens = tokenizer.encode("This shouldn't be! He'll go.")
+        decoded = tokenizer.decode(tokens)
+        assert decoded == "[CLS] this shouldn't be! he'll go. [SEP]"
+
+        tokenizer.clean_up_tokenization_spaces = False
+        decoded = tokenizer.decode(tokens)
+        assert decoded == "[CLS] this shouldn ' t be ! he ' ll go . [SEP]"
+        assert decoded == tokenizer.decode(tokens, clean_up_tokenization_spaces=False)
+
+        # Fast from slow
+        with tempfile.TemporaryDirectory() as tmp_dir_2:
+            tokenizer.save_pretrained(tmp_dir_2)
+            tokenizer_fast = BertTokenizerFast.from_pretrained(tmp_dir_2)
+            del tokenizer
+
+        assert tokenizer_fast.clean_up_tokenization_spaces is False
+        decoded = tokenizer_fast.decode(tokens, spaces_between_special_tokens=True)
+        # fast and slow don't have the same output when we don't cleanup
+        # tokenization space. Here `be!` vs `be !` and `go.` vs `go .`
+        assert decoded == "[CLS]this shouldn ' t be! he ' ll go.[SEP]"
+
+        tokenizer_fast.clean_up_tokenization_spaces = True
+        assert tokenizer_fast.clean_up_tokenization_spaces is True
+
+        decoded = tokenizer_fast.decode(tokens)
+        assert decoded == "[CLS]this shouldn't be! he'll go.[SEP]"
+
+        # Slow from fast
+        with tempfile.TemporaryDirectory() as tmp_dir_2:
+            tokenizer_fast.clean_up_tokenization_spaces = False
+            tokenizer_fast.save_pretrained(tmp_dir_2)
+            tokenizer = BertTokenizer.from_pretrained(tmp_dir_2)
+
+        assert tokenizer.clean_up_tokenization_spaces is False
+        decoded = tokenizer.decode(tokens)
+        assert decoded == "[CLS]this shouldn ' t be ! he ' ll go .[SEP]"
+
+        tokenizer.clean_up_tokenization_spaces = True
+        decoded = tokenizer.decode(tokens)
+        assert decoded == "[CLS]this shouldn't be! he'll go.[SEP]"
```