revert plus 0.19
This commit is contained in:
parent
1b945739ae
commit
07ba6e11fc
2
setup.py
2
setup.py
|
@ -174,7 +174,7 @@ _deps = [
|
|||
"tf2onnx",
|
||||
"timeout-decorator",
|
||||
"timm",
|
||||
"tokenizers>=0.16.0rc1",
|
||||
"tokenizers>=0.19.0rc0",
|
||||
"torch",
|
||||
"torchaudio",
|
||||
"torchvision",
|
||||
|
|
|
@ -79,7 +79,7 @@ deps = {
|
|||
"tf2onnx": "tf2onnx",
|
||||
"timeout-decorator": "timeout-decorator",
|
||||
"timm": "timm",
|
||||
"tokenizers": "tokenizers>=0.16.0rc1",
|
||||
"tokenizers": "tokenizers>=0.19.0rc0",
|
||||
"torch": "torch",
|
||||
"torchaudio": "torchaudio",
|
||||
"torchvision": "torchvision",
|
||||
|
|
|
@ -129,11 +129,6 @@ class CohereTokenizerFast(PreTrainedTokenizerFast):
|
|||
add_prefix_space=False,
|
||||
**kwargs,
|
||||
):
|
||||
if add_prefix_space is not None:
|
||||
kwargs["from_slow"] = True
|
||||
else:
|
||||
add_prefix_space = False
|
||||
|
||||
super().__init__(
|
||||
vocab_file=vocab_file,
|
||||
merges_file=merges_file,
|
||||
|
@ -156,6 +151,17 @@ class CohereTokenizerFast(PreTrainedTokenizerFast):
|
|||
self.grounded_generation_template = kwargs.pop("grounded_generation_template", None)
|
||||
self.tool_use_template = kwargs.pop("tool_use_template", None)
|
||||
|
||||
# TODO @ArthurZucker: this only works one way for now (switching `add_prefix_space` on); to be updated later.
|
||||
# The tests should also check this properly, as they were green before.
|
||||
pre_tok_state = pickle.dumps(self.backend_tokenizer.pre_tokenizer)
|
||||
decoder_state = pickle.dumps(self.backend_tokenizer.decoder)
|
||||
|
||||
if add_prefix_space:
|
||||
pre_tok_state = pre_tok_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
|
||||
decoder_state = decoder_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
|
||||
self.backend_tokenizer.pre_tokenizer = pickle.loads(pre_tok_state)
|
||||
self.backend_tokenizer.decoder = pickle.loads(decoder_state)
|
||||
|
||||
self.add_prefix_space = add_prefix_space
|
||||
|
||||
def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
|
||||
|
|
|
@ -112,14 +112,9 @@ class DebertaTokenizerFast(PreTrainedTokenizerFast):
|
|||
unk_token="[UNK]",
|
||||
pad_token="[PAD]",
|
||||
mask_token="[MASK]",
|
||||
add_prefix_space=None,
|
||||
add_prefix_space=False,
|
||||
**kwargs,
|
||||
):
|
||||
if add_prefix_space is not None:
|
||||
kwargs["from_slow"] = True
|
||||
else:
|
||||
add_prefix_space = False
|
||||
|
||||
super().__init__(
|
||||
vocab_file,
|
||||
merges_file,
|
||||
|
@ -137,6 +132,12 @@ class DebertaTokenizerFast(PreTrainedTokenizerFast):
|
|||
)
|
||||
self.add_bos_token = kwargs.pop("add_bos_token", False)
|
||||
|
||||
pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
|
||||
if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
|
||||
pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
|
||||
pre_tok_state["add_prefix_space"] = add_prefix_space
|
||||
self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
|
||||
|
||||
self.add_prefix_space = add_prefix_space
|
||||
|
||||
@property
|
||||
|
|
|
@ -94,14 +94,9 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast):
|
|||
unk_token="<|endoftext|>",
|
||||
bos_token="<|endoftext|>",
|
||||
eos_token="<|endoftext|>",
|
||||
add_prefix_space=None,
|
||||
add_prefix_space=False,
|
||||
**kwargs,
|
||||
):
|
||||
if add_prefix_space is not None:
|
||||
kwargs["from_slow"] = True
|
||||
else:
|
||||
add_prefix_space = False
|
||||
|
||||
super().__init__(
|
||||
vocab_file,
|
||||
merges_file,
|
||||
|
@ -115,6 +110,11 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast):
|
|||
|
||||
self.add_bos_token = kwargs.pop("add_bos_token", False)
|
||||
|
||||
pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
|
||||
if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
|
||||
pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
|
||||
pre_tok_state["add_prefix_space"] = add_prefix_space
|
||||
self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
|
||||
|
||||
self.add_prefix_space = add_prefix_space
|
||||
|
||||
|
|
|
@ -91,14 +91,9 @@ class GPTNeoXTokenizerFast(PreTrainedTokenizerFast):
|
|||
unk_token="<|endoftext|>",
|
||||
bos_token="<|endoftext|>",
|
||||
eos_token="<|endoftext|>",
|
||||
add_prefix_space=None,
|
||||
add_prefix_space=False,
|
||||
**kwargs,
|
||||
):
|
||||
if add_prefix_space is not None:
|
||||
kwargs["from_slow"] = True
|
||||
else:
|
||||
add_prefix_space = False
|
||||
|
||||
super().__init__(
|
||||
vocab_file,
|
||||
merges_file,
|
||||
|
@ -110,6 +105,12 @@ class GPTNeoXTokenizerFast(PreTrainedTokenizerFast):
|
|||
**kwargs,
|
||||
)
|
||||
|
||||
pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
|
||||
if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
|
||||
pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
|
||||
pre_tok_state["add_prefix_space"] = add_prefix_space
|
||||
self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
|
||||
|
||||
self.add_prefix_space = add_prefix_space
|
||||
|
||||
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
|
||||
|
|
|
@ -131,7 +131,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
|
|||
unk_token="<unk>",
|
||||
pad_token="<pad>",
|
||||
mask_token="<mask>",
|
||||
add_prefix_space=None,
|
||||
add_prefix_space=True,
|
||||
trim_offsets=True,
|
||||
cls_token_box=[0, 0, 0, 0],
|
||||
sep_token_box=[0, 0, 0, 0],
|
||||
|
@ -140,11 +140,6 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
|
|||
only_label_first_subword=True,
|
||||
**kwargs,
|
||||
):
|
||||
if add_prefix_space is not None:
|
||||
kwargs["from_slow"] = True
|
||||
else:
|
||||
add_prefix_space = True
|
||||
|
||||
super().__init__(
|
||||
vocab_file,
|
||||
merges_file,
|
||||
|
|
|
@ -130,15 +130,10 @@ class LEDTokenizerFast(PreTrainedTokenizerFast):
|
|||
unk_token="<unk>",
|
||||
pad_token="<pad>",
|
||||
mask_token="<mask>",
|
||||
add_prefix_space=None,
|
||||
add_prefix_space=False,
|
||||
trim_offsets=True,
|
||||
**kwargs,
|
||||
):
|
||||
if add_prefix_space is not None:
|
||||
kwargs["from_slow"] = True
|
||||
else:
|
||||
add_prefix_space = False
|
||||
|
||||
# we have to specify that this token is special, otherwise adding it will reset the normalized flag to `False` in `add_special_tokens`
|
||||
mask_token = (
|
||||
AddedToken(mask_token, lstrip=True, normalized=True, special=True)
|
||||
|
@ -162,6 +157,12 @@ class LEDTokenizerFast(PreTrainedTokenizerFast):
|
|||
**kwargs,
|
||||
)
|
||||
|
||||
pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
|
||||
if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
|
||||
pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
|
||||
pre_tok_state["add_prefix_space"] = add_prefix_space
|
||||
self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
|
||||
|
||||
self.add_prefix_space = add_prefix_space
|
||||
|
||||
# the pre_tokenizer is already updated in the GPT2TokenizerFast `__init__`
|
||||
|
|
|
@ -132,11 +132,6 @@ class LongformerTokenizerFast(PreTrainedTokenizerFast):
|
|||
trim_offsets=True,
|
||||
**kwargs,
|
||||
):
|
||||
if add_prefix_space is not None:
|
||||
kwargs["from_slow"] = True
|
||||
else:
|
||||
add_prefix_space = False
|
||||
|
||||
mask_token = (
|
||||
AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
|
||||
if isinstance(mask_token, str)
|
||||
|
|
|
@ -155,7 +155,7 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
|
|||
unk_token="<unk>",
|
||||
pad_token="<pad>",
|
||||
mask_token="<mask>",
|
||||
add_prefix_space=None,
|
||||
add_prefix_space=False,
|
||||
max_depth=50,
|
||||
max_width=1000,
|
||||
pad_width=1001,
|
||||
|
@ -174,11 +174,6 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
|
|||
# The mask token behaves like a normal word, i.e. it includes the space before it
|
||||
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
|
||||
|
||||
if add_prefix_space is not None:
|
||||
kwargs["from_slow"] = True
|
||||
else:
|
||||
add_prefix_space = False
|
||||
|
||||
super().__init__(
|
||||
vocab_file=vocab_file,
|
||||
merges_file=merges_file,
|
||||
|
@ -212,6 +207,12 @@ class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
|
|||
|
||||
self.tags_dict = tags_dict
|
||||
|
||||
pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
|
||||
if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
|
||||
pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
|
||||
pre_tok_state["add_prefix_space"] = add_prefix_space
|
||||
self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
|
||||
|
||||
self.add_prefix_space = add_prefix_space
|
||||
|
||||
tokenizer_component = "post_processor"
|
||||
|
|
|
@ -130,7 +130,7 @@ class MvpTokenizerFast(PreTrainedTokenizerFast):
|
|||
unk_token="<unk>",
|
||||
pad_token="<pad>",
|
||||
mask_token="<mask>",
|
||||
add_prefix_space=None,
|
||||
add_prefix_space=False,
|
||||
trim_offsets=True,
|
||||
**kwargs,
|
||||
):
|
||||
|
@ -143,12 +143,8 @@ class MvpTokenizerFast(PreTrainedTokenizerFast):
|
|||
|
||||
# The mask token behaves like a normal word, i.e. it includes the space before it
|
||||
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
|
||||
|
||||
if add_prefix_space is not None:
|
||||
if add_prefix_space:
|
||||
kwargs["from_slow"] = True
|
||||
else:
|
||||
add_prefix_space = False
|
||||
|
||||
super().__init__(
|
||||
vocab_file,
|
||||
merges_file,
|
||||
|
|
|
@ -127,7 +127,7 @@ class RobertaTokenizerFast(PreTrainedTokenizerFast):
|
|||
unk_token="<unk>",
|
||||
pad_token="<pad>",
|
||||
mask_token="<mask>",
|
||||
add_prefix_space=None,
|
||||
add_prefix_space=False,
|
||||
trim_offsets=True,
|
||||
**kwargs,
|
||||
):
|
||||
|
@ -136,11 +136,6 @@ class RobertaTokenizerFast(PreTrainedTokenizerFast):
|
|||
if isinstance(mask_token, str)
|
||||
else mask_token
|
||||
)
|
||||
if add_prefix_space is not None:
|
||||
kwargs["from_slow"] = True
|
||||
else:
|
||||
add_prefix_space = False
|
||||
|
||||
super().__init__(
|
||||
vocab_file,
|
||||
merges_file,
|
||||
|
@ -158,6 +153,12 @@ class RobertaTokenizerFast(PreTrainedTokenizerFast):
|
|||
**kwargs,
|
||||
)
|
||||
|
||||
pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
|
||||
if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
|
||||
pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
|
||||
pre_tok_state["add_prefix_space"] = add_prefix_space
|
||||
self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
|
||||
|
||||
self.add_prefix_space = add_prefix_space
|
||||
|
||||
tokenizer_component = "post_processor"
|
||||
|
|
Loading…
Reference in New Issue