From 07ba6e11fcc88cfc364412be167f7ad2d8d411b0 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Wed, 17 Apr 2024 10:14:51 +0200 Subject: [PATCH] revert plus 0.19 --- setup.py | 2 +- src/transformers/dependency_versions_table.py | 2 +- .../models/cohere/tokenization_cohere_fast.py | 16 +++++++++++----- .../models/deberta/tokenization_deberta_fast.py | 13 +++++++------ .../models/gpt2/tokenization_gpt2_fast.py | 12 ++++++------ .../gpt_neox/tokenization_gpt_neox_fast.py | 13 +++++++------ .../layoutlmv3/tokenization_layoutlmv3_fast.py | 7 +------ .../models/led/tokenization_led_fast.py | 13 +++++++------ .../longformer/tokenization_longformer_fast.py | 5 ----- .../markuplm/tokenization_markuplm_fast.py | 13 +++++++------ .../models/mvp/tokenization_mvp_fast.py | 8 ++------ .../models/roberta/tokenization_roberta_fast.py | 13 +++++++------ 12 files changed, 57 insertions(+), 60 deletions(-) diff --git a/setup.py b/setup.py index 97e32f1288..5ee104129d 100644 --- a/setup.py +++ b/setup.py @@ -174,7 +174,7 @@ _deps = [ "tf2onnx", "timeout-decorator", "timm", - "tokenizers>=0.16.0rc1", + "tokenizers>=0.19.0rc0", "torch", "torchaudio", "torchvision", diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index a150e6337d..8a6976d50b 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -79,7 +79,7 @@ deps = { "tf2onnx": "tf2onnx", "timeout-decorator": "timeout-decorator", "timm": "timm", - "tokenizers": "tokenizers>=0.16.0rc1", + "tokenizers": "tokenizers>=0.19.0rc0", "torch": "torch", "torchaudio": "torchaudio", "torchvision": "torchvision", diff --git a/src/transformers/models/cohere/tokenization_cohere_fast.py b/src/transformers/models/cohere/tokenization_cohere_fast.py index 5968374874..e733a6dfd0 100644 --- a/src/transformers/models/cohere/tokenization_cohere_fast.py +++ b/src/transformers/models/cohere/tokenization_cohere_fast.py @@ -129,11 +129,6 @@ class CohereTokenizerFast(PreTrainedTokenizerFast): add_prefix_space=False, **kwargs, ): - if add_prefix_space is not None: - kwargs["from_slow"] = True - else: - add_prefix_space = False - super().__init__( vocab_file=vocab_file, merges_file=merges_file, @@ -156,6 +151,17 @@ class CohereTokenizerFast(PreTrainedTokenizerFast): self.grounded_generation_template = kwargs.pop("grounded_generation_template", None) self.tool_use_template = kwargs.pop("tool_use_template", None) + # TODO @ArthurZucker this can only work one way for now, to update later-on. Tests should also properly + # check this as they were green before. + pre_tok_state = pickle.dumps(self.backend_tokenizer.pre_tokenizer) + decoder_state = pickle.dumps(self.backend_tokenizer.decoder) + + if add_prefix_space: + pre_tok_state = pre_tok_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true') + decoder_state = decoder_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true') + self.backend_tokenizer.pre_tokenizer = pickle.loads(pre_tok_state) + self.backend_tokenizer.decoder = pickle.loads(decoder_state) + self.add_prefix_space = add_prefix_space def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding: diff --git a/src/transformers/models/deberta/tokenization_deberta_fast.py b/src/transformers/models/deberta/tokenization_deberta_fast.py index cac9e1d77f..07226443d3 100644 --- a/src/transformers/models/deberta/tokenization_deberta_fast.py +++ b/src/transformers/models/deberta/tokenization_deberta_fast.py @@ -112,14 +112,9 @@ class DebertaTokenizerFast(PreTrainedTokenizerFast): unk_token="[UNK]", pad_token="[PAD]", mask_token="[MASK]", - add_prefix_space=None, + add_prefix_space=False, **kwargs, ): - if add_prefix_space is not None: - kwargs["from_slow"] = True - else: - add_prefix_space = False - super().__init__( vocab_file, merges_file, @@ -137,6 +132,12 @@ class DebertaTokenizerFast(PreTrainedTokenizerFast): ) self.add_bos_token = kwargs.pop("add_bos_token", False) + pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__()) + if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space: + pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type")) + pre_tok_state["add_prefix_space"] = add_prefix_space + self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state) + self.add_prefix_space = add_prefix_space @property diff --git a/src/transformers/models/gpt2/tokenization_gpt2_fast.py b/src/transformers/models/gpt2/tokenization_gpt2_fast.py index 2b7086db35..fb3a5d4a0c 100644 --- a/src/transformers/models/gpt2/tokenization_gpt2_fast.py +++ b/src/transformers/models/gpt2/tokenization_gpt2_fast.py @@ -94,14 +94,9 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast): unk_token="<|endoftext|>", bos_token="<|endoftext|>", eos_token="<|endoftext|>", - add_prefix_space=None, + add_prefix_space=False, **kwargs, ): - if add_prefix_space is not None: - kwargs["from_slow"] = True - else: - add_prefix_space = False - super().__init__( vocab_file, merges_file, @@ -115,6 +110,11 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast): self.add_bos_token = kwargs.pop("add_bos_token", False) + pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__()) + if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space: + pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type")) + pre_tok_state["add_prefix_space"] = add_prefix_space + self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state) self.add_prefix_space = add_prefix_space diff --git a/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py b/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py index f8a0a3f64d..dceb512e8f 100644 --- a/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py +++ b/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py @@ -91,14 +91,9 @@ class GPTNeoXTokenizerFast(PreTrainedTokenizerFast): unk_token="<|endoftext|>", bos_token="<|endoftext|>", eos_token="<|endoftext|>", - add_prefix_space=None, + add_prefix_space=False, **kwargs, ): - if add_prefix_space is not None: - kwargs["from_slow"] = True - else: - add_prefix_space = False - super().__init__( vocab_file, merges_file, @@ -110,6 +105,12 @@ class GPTNeoXTokenizerFast(PreTrainedTokenizerFast): **kwargs, ) + pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__()) + if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space: + pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type")) + pre_tok_state["add_prefix_space"] = add_prefix_space + self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state) + self.add_prefix_space = add_prefix_space def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: diff --git a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py index d7f973bdb7..61dc91848c 100644 --- a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py +++ b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py @@ -131,7 +131,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast): unk_token="", pad_token="", mask_token="", - add_prefix_space=None, + add_prefix_space=True, trim_offsets=True, cls_token_box=[0, 0, 0, 0], sep_token_box=[0, 0, 0, 0], @@ -140,11 +140,6 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast): only_label_first_subword=True, **kwargs, ): - if add_prefix_space is not None: - kwargs["from_slow"] = True - else: - add_prefix_space = True - super().__init__( vocab_file, merges_file, diff --git a/src/transformers/models/led/tokenization_led_fast.py b/src/transformers/models/led/tokenization_led_fast.py index fe86a7436e..ca15eb997b 100644 --- a/src/transformers/models/led/tokenization_led_fast.py +++ b/src/transformers/models/led/tokenization_led_fast.py @@ -130,15 +130,10 @@ class LEDTokenizerFast(PreTrainedTokenizerFast): unk_token="", pad_token="", mask_token="", - add_prefix_space=None, + add_prefix_space=False, trim_offsets=True, **kwargs, ): - if add_prefix_space is not None: - kwargs["from_slow"] = True - else: - add_prefix_space = False - # we have to specify that this tokens is special otherwise adding it will reset the normalized flag to `False` in `add_special_tokens` mask_token = ( AddedToken(mask_token, lstrip=True, normalized=True, special=True) @@ -162,6 +157,12 @@ class LEDTokenizerFast(PreTrainedTokenizerFast): **kwargs, ) + pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__()) + if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space: + pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type")) + pre_tok_state["add_prefix_space"] = add_prefix_space + self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state) + self.add_prefix_space = add_prefix_space # the pre_tokenizer is already updated in the GPT2TokenizerFast `__init__` diff --git a/src/transformers/models/longformer/tokenization_longformer_fast.py b/src/transformers/models/longformer/tokenization_longformer_fast.py index a478aae05a..a6f160ac07 100644 --- a/src/transformers/models/longformer/tokenization_longformer_fast.py +++ b/src/transformers/models/longformer/tokenization_longformer_fast.py @@ -132,11 +132,6 @@ class LongformerTokenizerFast(PreTrainedTokenizerFast): trim_offsets=True, **kwargs, ): - if add_prefix_space is not None: - kwargs["from_slow"] = True - else: - add_prefix_space = False - mask_token = ( AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False) if isinstance(mask_token, str) diff --git a/src/transformers/models/markuplm/tokenization_markuplm_fast.py b/src/transformers/models/markuplm/tokenization_markuplm_fast.py index 98d337a72f..ff0e4ffeb5 100644 --- a/src/transformers/models/markuplm/tokenization_markuplm_fast.py +++ b/src/transformers/models/markuplm/tokenization_markuplm_fast.py @@ -155,7 +155,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast): unk_token="", pad_token="", mask_token="", - add_prefix_space=None, + add_prefix_space=False, max_depth=50, max_width=1000, pad_width=1001, @@ -174,11 +174,6 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast): # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token - if add_prefix_space is not None: - kwargs["from_slow"] = True - else: - add_prefix_space = False - super().__init__( vocab_file=vocab_file, merges_file=merges_file, @@ -212,6 +207,12 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast): self.tags_dict = tags_dict + pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__()) + if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space: + pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type")) + pre_tok_state["add_prefix_space"] = add_prefix_space + self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state) + self.add_prefix_space = add_prefix_space tokenizer_component = "post_processor" diff --git a/src/transformers/models/mvp/tokenization_mvp_fast.py b/src/transformers/models/mvp/tokenization_mvp_fast.py index 139fbe0526..d391d27a7b 100644 --- a/src/transformers/models/mvp/tokenization_mvp_fast.py +++ b/src/transformers/models/mvp/tokenization_mvp_fast.py @@ -130,7 +130,7 @@ class MvpTokenizerFast(PreTrainedTokenizerFast): unk_token="", pad_token="", mask_token="", - add_prefix_space=None, + add_prefix_space=False, trim_offsets=True, **kwargs, ): @@ -143,12 +143,8 @@ class MvpTokenizerFast(PreTrainedTokenizerFast): # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token - - if add_prefix_space is not None: + if add_prefix_space: kwargs["from_slow"] = True - else: - add_prefix_space = False - super().__init__( vocab_file, merges_file, diff --git a/src/transformers/models/roberta/tokenization_roberta_fast.py b/src/transformers/models/roberta/tokenization_roberta_fast.py index 69e611984b..702af8a33e 100644 --- a/src/transformers/models/roberta/tokenization_roberta_fast.py +++ b/src/transformers/models/roberta/tokenization_roberta_fast.py @@ -127,7 +127,7 @@ class RobertaTokenizerFast(PreTrainedTokenizerFast): unk_token="", pad_token="", mask_token="", - add_prefix_space=None, + add_prefix_space=False, trim_offsets=True, **kwargs, ): @@ -136,11 +136,6 @@ class RobertaTokenizerFast(PreTrainedTokenizerFast): if isinstance(mask_token, str) else mask_token ) - if add_prefix_space is not None: - kwargs["from_slow"] = True - else: - add_prefix_space = False - super().__init__( vocab_file, merges_file, @@ -158,6 +153,12 @@ class RobertaTokenizerFast(PreTrainedTokenizerFast): **kwargs, ) + pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__()) + if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space: + pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type")) + pre_tok_state["add_prefix_space"] = add_prefix_space + self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state) + self.add_prefix_space = add_prefix_space tokenizer_component = "post_processor"