revert plus 0.19

Arthur Zucker 2024-04-17 10:14:51 +02:00
parent 1b945739ae
commit 07ba6e11fc
12 changed files with 57 additions and 60 deletions

View File

@@ -174,7 +174,7 @@ _deps = [
    "tf2onnx",
    "timeout-decorator",
    "timm",
    "tokenizers>=0.16.0rc1",
    "tokenizers>=0.19.0rc0",
    "torch",
    "torchaudio",
    "torchvision",

View File

@@ -79,7 +79,7 @@ deps = {
    "tf2onnx": "tf2onnx",
    "timeout-decorator": "timeout-decorator",
    "timm": "timm",
    "tokenizers": "tokenizers>=0.16.0rc1",
    "tokenizers": "tokenizers>=0.19.0rc0",
    "torch": "torch",
    "torchaudio": "torchaudio",
    "torchvision": "torchvision",

View File

@@ -129,11 +129,6 @@ class CohereTokenizerFast(PreTrainedTokenizerFast):
        add_prefix_space=False,
        **kwargs,
    ):
        if add_prefix_space is not None:
            kwargs["from_slow"] = True
        else:
            add_prefix_space = False
        super().__init__(
            vocab_file=vocab_file,
            merges_file=merges_file,
@@ -156,6 +151,17 @@ class CohereTokenizerFast(PreTrainedTokenizerFast):
        self.grounded_generation_template = kwargs.pop("grounded_generation_template", None)
        self.tool_use_template = kwargs.pop("tool_use_template", None)
        # TODO @ArthurZucker this can only work one way for now, to update later-on. Tests should also properly
        # check this as they were green before.
        pre_tok_state = pickle.dumps(self.backend_tokenizer.pre_tokenizer)
        decoder_state = pickle.dumps(self.backend_tokenizer.decoder)
        if add_prefix_space:
            pre_tok_state = pre_tok_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
            decoder_state = decoder_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
        self.backend_tokenizer.pre_tokenizer = pickle.loads(pre_tok_state)
        self.backend_tokenizer.decoder = pickle.loads(decoder_state)
        self.add_prefix_space = add_prefix_space

    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
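The pickle round-trip restored here works because the `tokenizers` pre-tokenizer and decoder objects pickle their state as a serialized JSON payload, and because the replacement `b'"add_prefix_space": true'` has the same byte length as `b'"add_prefix_space":false'`, so the length-prefixed pickle stream stays valid. A minimal standalone sketch of the same trick (illustrative only, not part of the commit; it assumes a ByteLevel pre-tokenizer from the `tokenizers` library):

import pickle

from tokenizers import Tokenizer, models, pre_tokenizers

# Toy backend tokenizer with the flag initially off, as in a saved tokenizer.json.
backend = Tokenizer(models.BPE())
backend.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

# Serialize, patch the embedded JSON in place (same byte length), and reload.
# The diff applies the identical replacement to the decoder state as well.
state = pickle.dumps(backend.pre_tokenizer)
state = state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
backend.pre_tokenizer = pickle.loads(state)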

View File

@@ -112,14 +112,9 @@ class DebertaTokenizerFast(PreTrainedTokenizerFast):
        unk_token="[UNK]",
        pad_token="[PAD]",
        mask_token="[MASK]",
        add_prefix_space=None,
        add_prefix_space=False,
        **kwargs,
    ):
        if add_prefix_space is not None:
            kwargs["from_slow"] = True
        else:
            add_prefix_space = False
        super().__init__(
            vocab_file,
            merges_file,
@@ -137,6 +132,12 @@ class DebertaTokenizerFast(PreTrainedTokenizerFast):
        )
        self.add_bos_token = kwargs.pop("add_bos_token", False)
        pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
        if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
            pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
            pre_tok_state["add_prefix_space"] = add_prefix_space
            self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
        self.add_prefix_space = add_prefix_space

    @property
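The DeBERTa, GPT-2, GPT-NeoX, LED, MarkupLM and RoBERTa tokenizers restore the same flag flip without pickle: the pre-tokenizer's JSON state is read back via `__getstate__()`, the flag is swapped, and the pre-tokenizer is rebuilt from that state. A standalone sketch of the pattern (illustrative only; variable names mirror the diff, and a ByteLevel pre-tokenizer is assumed):

import json

from tokenizers import Tokenizer, models, pre_tokenizers

backend_tokenizer = Tokenizer(models.BPE())
backend_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
add_prefix_space = True  # value passed to the tokenizer's __init__

# __getstate__() returns the pre-tokenizer's serialized JSON state.
pre_tok_state = json.loads(backend_tokenizer.pre_tokenizer.__getstate__())
if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
    pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
    pre_tok_state["add_prefix_space"] = add_prefix_space
    backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)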

View File

@@ -94,14 +94,9 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast):
        unk_token="<|endoftext|>",
        bos_token="<|endoftext|>",
        eos_token="<|endoftext|>",
        add_prefix_space=None,
        add_prefix_space=False,
        **kwargs,
    ):
        if add_prefix_space is not None:
            kwargs["from_slow"] = True
        else:
            add_prefix_space = False
        super().__init__(
            vocab_file,
            merges_file,
@@ -115,6 +110,11 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast):
        self.add_bos_token = kwargs.pop("add_bos_token", False)
        pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
        if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
            pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
            pre_tok_state["add_prefix_space"] = add_prefix_space
            self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
        self.add_prefix_space = add_prefix_space

View File

@@ -91,14 +91,9 @@ class GPTNeoXTokenizerFast(PreTrainedTokenizerFast):
        unk_token="<|endoftext|>",
        bos_token="<|endoftext|>",
        eos_token="<|endoftext|>",
        add_prefix_space=None,
        add_prefix_space=False,
        **kwargs,
    ):
        if add_prefix_space is not None:
            kwargs["from_slow"] = True
        else:
            add_prefix_space = False
        super().__init__(
            vocab_file,
            merges_file,
@@ -110,6 +105,12 @@ class GPTNeoXTokenizerFast(PreTrainedTokenizerFast):
            **kwargs,
        )
        pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
        if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
            pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
            pre_tok_state["add_prefix_space"] = add_prefix_space
            self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
        self.add_prefix_space = add_prefix_space

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:

View File

@@ -131,7 +131,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        add_prefix_space=None,
        add_prefix_space=True,
        trim_offsets=True,
        cls_token_box=[0, 0, 0, 0],
        sep_token_box=[0, 0, 0, 0],
@@ -140,11 +140,6 @@
        only_label_first_subword=True,
        **kwargs,
    ):
        if add_prefix_space is not None:
            kwargs["from_slow"] = True
        else:
            add_prefix_space = True
        super().__init__(
            vocab_file,
            merges_file,

View File

@@ -130,15 +130,10 @@ class LEDTokenizerFast(PreTrainedTokenizerFast):
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        add_prefix_space=None,
        add_prefix_space=False,
        trim_offsets=True,
        **kwargs,
    ):
        if add_prefix_space is not None:
            kwargs["from_slow"] = True
        else:
            add_prefix_space = False
        # we have to specify that this tokens is special otherwise adding it will reset the normalized flag to `False` in `add_special_tokens`
        mask_token = (
            AddedToken(mask_token, lstrip=True, normalized=True, special=True)
@@ -162,6 +157,12 @@ class LEDTokenizerFast(PreTrainedTokenizerFast):
            **kwargs,
        )
        pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
        if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
            pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
            pre_tok_state["add_prefix_space"] = add_prefix_space
            self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
        self.add_prefix_space = add_prefix_space

        # the pre_tokenizer is already updated in the GPT2TokenizerFast `__init__`

View File

@@ -132,11 +132,6 @@ class LongformerTokenizerFast(PreTrainedTokenizerFast):
        trim_offsets=True,
        **kwargs,
    ):
        if add_prefix_space is not None:
            kwargs["from_slow"] = True
        else:
            add_prefix_space = False
        mask_token = (
            AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
            if isinstance(mask_token, str)

View File

@@ -155,7 +155,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        add_prefix_space=None,
        add_prefix_space=False,
        max_depth=50,
        max_width=1000,
        pad_width=1001,
@@ -174,11 +174,6 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
        # Mask token behave like a normal word, i.e. include the space before it
        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
        if add_prefix_space is not None:
            kwargs["from_slow"] = True
        else:
            add_prefix_space = False
        super().__init__(
            vocab_file=vocab_file,
            merges_file=merges_file,
@@ -212,6 +207,12 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
        self.tags_dict = tags_dict
        pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
        if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
            pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
            pre_tok_state["add_prefix_space"] = add_prefix_space
            self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
        self.add_prefix_space = add_prefix_space
        tokenizer_component = "post_processor"

View File

@@ -130,7 +130,7 @@ class MvpTokenizerFast(PreTrainedTokenizerFast):
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        add_prefix_space=None,
        add_prefix_space=False,
        trim_offsets=True,
        **kwargs,
    ):
@@ -143,12 +143,8 @@ class MvpTokenizerFast(PreTrainedTokenizerFast):
        # Mask token behave like a normal word, i.e. include the space before it
        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
        if add_prefix_space is not None:
        if add_prefix_space:
            kwargs["from_slow"] = True
        else:
            add_prefix_space = False
        super().__init__(
            vocab_file,
            merges_file,

View File

@@ -127,7 +127,7 @@ class RobertaTokenizerFast(PreTrainedTokenizerFast):
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        add_prefix_space=None,
        add_prefix_space=False,
        trim_offsets=True,
        **kwargs,
    ):
@@ -136,11 +136,6 @@ class RobertaTokenizerFast(PreTrainedTokenizerFast):
            if isinstance(mask_token, str)
            else mask_token
        )
        if add_prefix_space is not None:
            kwargs["from_slow"] = True
        else:
            add_prefix_space = False
        super().__init__(
            vocab_file,
            merges_file,
@@ -158,6 +153,12 @@ class RobertaTokenizerFast(PreTrainedTokenizerFast):
            **kwargs,
        )
        pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
        if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
            pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
            pre_tok_state["add_prefix_space"] = add_prefix_space
            self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
        self.add_prefix_space = add_prefix_space
        tokenizer_component = "post_processor"