Compare commits

...

24 Commits

Author SHA1 Message Date
Arthur Zucker d5cb129e3d fixup 2024-04-17 10:55:42 +02:00
Nicolas Patry d5a20b8e94 Accounting for the breaking change. 2024-04-17 10:55:11 +02:00
Arthur Zucker ea98377415 oups 2024-04-17 10:49:48 +02:00
Arthur Zucker cfccbcc513 "tokenizers": "tokenizers>=0.19,<0.20", 2024-04-17 10:49:17 +02:00
Arthur Zucker 884a51f516 last two 2024-04-17 10:17:22 +02:00
Arthur Zucker 4a2b4fe00d more revert 2024-04-17 10:16:43 +02:00
Arthur Zucker 07ba6e11fc revert plus 0.19 2024-04-17 10:14:51 +02:00
Arthur Zucker 1b945739ae Remove all post init strangeness 2024-04-17 09:48:12 +02:00
Arthur Zucker eb541a4daa fix-copies 2024-04-17 09:42:01 +02:00
Arthur Zucker 7fa78347b5 more fixes: use from_slow whenever default prefix space is not passed 2024-04-17 09:36:16 +02:00
Arthur Zucker 88b3017eec fix convert_slow 2024-04-17 09:30:17 +02:00
Arthur Zucker 53930586a2 Merge branch 'main' of github.com:huggingface/transformers into update-tokenizers-version 2024-04-16 22:05:26 +02:00
Arthur Zucker d96b9f1c36 AH 2024-04-16 22:04:51 +02:00
Arthur Zucker 0f0fb87d0f decoders do not have a prepend scheme duh 2024-04-16 21:40:57 +02:00
Arthur Zucker 419fe50fd6 style? 2024-04-15 19:14:18 +02:00
Arthur Zucker d7386408e6 update 2024-04-15 19:00:49 +02:00
Arthur Zucker b65cf702fa fixup 2024-04-15 18:58:25 +02:00
Arthur Zucker 932b296c28 nit 2024-04-15 18:57:58 +02:00
Arthur Zucker 2447480da4 fix some fast tokenizers 2024-04-15 18:56:43 +02:00
Arthur Zucker dafccda814 style 2024-04-15 18:52:52 +02:00
Arthur Zucker c62fb163a3 Merge branch 'main' of github.com:huggingface/transformers into update-tokenizers-version 2024-04-15 18:50:46 +02:00
Arthur Zucker f77e8c3944 use rc1 2024-04-15 18:39:08 +02:00
Arthur Zucker 9a16c5d429 update tokenizer version 2024-04-15 18:38:23 +02:00
Arthur Zucker ca5d48983b update to tokenizers version 2024-04-02 10:45:27 +02:00
4 changed files with 29 additions and 16 deletions

View File

@@ -46,12 +46,16 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
         )
         tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
             [
-                pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space),
+                pre_tokenizers.Metaspace(
+                    replacement=replacement, add_prefix_space="always" if add_prefix_space else "never"
+                ),
                 pre_tokenizers.Digits(individual_digits=True),
                 pre_tokenizers.Punctuation(),
             ]
         )
-        tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
+        tokenizer.decoder = decoders.Metaspace(
+            replacement=replacement, add_prefix_space="always" if add_prefix_space else "never"
+        )
 
         tokenizer.post_processor = TemplateProcessing(
             single=f"$A {self.special_tokens['eos']['token']}",

View File

@@ -174,7 +174,7 @@ _deps = [
     "tf2onnx",
     "timeout-decorator",
     "timm",
-    "tokenizers>=0.14,<0.19",
+    "tokenizers>=0.19,<0.20",
     "torch",
     "torchaudio",
     "torchvision",

View File

@@ -23,6 +23,7 @@ import warnings
 from typing import Dict, List, Tuple
 
 from packaging import version
 from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
 from tokenizers.models import BPE, Unigram, WordPiece
@@ -43,6 +44,16 @@ def import_protobuf(error_message=""):
         raise ImportError(PROTOBUF_IMPORT_ERROR.format(error_message))
 
 
+def _get_prepend_scheme(add_prefix_space: bool, original_tokenizer) -> str:
+    if add_prefix_space:
+        prepend_scheme = "always"
+        if hasattr(original_tokenizer, "legacy") and not original_tokenizer.legacy:
+            prepend_scheme = "first"
+    else:
+        prepend_scheme = "never"
+    return prepend_scheme
+
+
 class SentencePieceExtractor:
     """
     Extractor implementation for SentencePiece trained models. https://github.com/google/sentencepiece
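
For reference, the helper added above collapses the old per-converter logic into one place: add_prefix_space=False always maps to "never", and add_prefix_space=True maps to "always" unless the slow tokenizer exposes legacy=False, in which case it maps to "first". A small sketch of that mapping, restating the helper from the hunk and exercising it with SimpleNamespace stand-ins (the stand-in objects are illustrative, not real tokenizers):

from types import SimpleNamespace


def _get_prepend_scheme(add_prefix_space: bool, original_tokenizer) -> str:
    # Copied from the hunk above: pick the Metaspace prepend scheme from the slow tokenizer.
    if add_prefix_space:
        prepend_scheme = "always"
        if hasattr(original_tokenizer, "legacy") and not original_tokenizer.legacy:
            prepend_scheme = "first"
    else:
        prepend_scheme = "never"
    return prepend_scheme


# Illustrative stand-ins: only the `legacy` attribute matters to the helper.
legacy_spm = SimpleNamespace(legacy=True)       # legacy-mode sentencepiece tokenizer
non_legacy_spm = SimpleNamespace(legacy=False)  # non-legacy (Llama-style) tokenizer
no_legacy_attr = SimpleNamespace()              # tokenizer classes without a `legacy` flag

assert _get_prepend_scheme(True, legacy_spm) == "always"
assert _get_prepend_scheme(True, non_legacy_spm) == "first"
assert _get_prepend_scheme(True, no_legacy_attr) == "always"
assert _get_prepend_scheme(False, non_legacy_spm) == "never"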
@@ -597,18 +608,15 @@ class SpmConverter(Converter):
         return normalizers.Sequence([normalizers.Precompiled(precompiled_charsmap)] + _normalizers)
 
     def pre_tokenizer(self, replacement, add_prefix_space):
-        prepend_scheme = "always"
-        if hasattr(self.original_tokenizer, "legacy") and not self.original_tokenizer.legacy:
-            prepend_scheme = "first"
-        return pre_tokenizers.Metaspace(
-            replacement=replacement, add_prefix_space=add_prefix_space, prepend_scheme=prepend_scheme
-        )
+        prepend_scheme = _get_prepend_scheme(add_prefix_space, self.original_tokenizer)
+        return pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
 
     def post_processor(self):
         return None
 
     def decoder(self, replacement, add_prefix_space):
-        return decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
+        prepend_scheme = _get_prepend_scheme(add_prefix_space, self.original_tokenizer)
+        return decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
 
     def converted(self) -> Tokenizer:
         tokenizer = self.tokenizer(self.proto)
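
With this branch the Metaspace pre-tokenizer and decoder are configured through prepend_scheme rather than add_prefix_space, which appears to be the breaking change in tokenizers 0.19 that the version bump accounts for; the rewritten SpmConverter methods simply forward the scheme computed by _get_prepend_scheme. A minimal usage sketch of those two components, assuming tokenizers>=0.19 is installed (not taken from the diff):

from tokenizers import decoders, pre_tokenizers

# The same components SpmConverter.pre_tokenizer and SpmConverter.decoder now build.
pre_tok = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme="always")
dec = decoders.Metaspace(replacement="▁", prepend_scheme="always")

# The pre-tokenizer rewrites spaces as "▁" and prepends one to the first word.
print(pre_tok.pre_tokenize_str("Hello world"))
# The decoder reverses that substitution on a list of tokens.
print(dec.decode(["▁Hello", "▁world"]))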
@@ -722,7 +730,8 @@ class DebertaV2Converter(SpmConverter):
         list_pretokenizers = []
         if self.original_tokenizer.split_by_punct:
             list_pretokenizers.append(pre_tokenizers.Punctuation(behavior="isolated"))
-        list_pretokenizers.append(pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space))
+        prepend_scheme = _get_prepend_scheme(add_prefix_space, self.original_tokenizer)
+        list_pretokenizers.append(pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme))
         return pre_tokenizers.Sequence(list_pretokenizers)
 
     def normalizer(self, proto):
@@ -1007,10 +1016,11 @@ class PegasusConverter(SpmConverter):
         return proto.trainer_spec.unk_id + self.original_tokenizer.offset
 
     def pre_tokenizer(self, replacement, add_prefix_space):
+        prepend_scheme = _get_prepend_scheme(add_prefix_space, self.original_tokenizer)
         return pre_tokenizers.Sequence(
             [
                 pre_tokenizers.WhitespaceSplit(),
-                pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space),
+                pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme),
             ]
         )
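
PegasusConverter chains a whitespace split in front of the Metaspace step; the composition works the same way outside the converter. A short illustrative sketch, again assuming tokenizers>=0.19:

from tokenizers import pre_tokenizers

# Compose the two steps the Pegasus pre_tokenizer above returns.
composed = pre_tokenizers.Sequence(
    [
        pre_tokenizers.WhitespaceSplit(),
        pre_tokenizers.Metaspace(replacement="▁", prepend_scheme="always"),
    ]
)
# Words are split on whitespace first, then each one goes through Metaspace.
print(composed.pre_tokenize_str("Summarize this text"))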
@@ -1385,9 +1395,8 @@ class LlamaConverter(SpmConverter):
 
     def normalizer(self, proto):
         sequence = []
-        if hasattr(self.original_tokenizer, "add_prefix_space"):
-            if self.original_tokenizer.add_prefix_space:
-                sequence += [normalizers.Prepend(prepend="▁")]
+        if getattr(self.original_tokenizer, "add_prefix_space", False):
+            sequence += [normalizers.Prepend(prepend="▁")]
         sequence += [normalizers.Replace(pattern=" ", content="▁")]
         return normalizers.Sequence(sequence)
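
The Llama converter handles the prefix space at the normalizer level instead: it prepends the SentencePiece meta-symbol and rewrites spaces as "▁". A hedged sketch of that pipeline built directly from tokenizers normalizers (the input string is arbitrary):

from tokenizers import normalizers

# The two steps the normalizer above assembles when add_prefix_space is truthy.
llama_like = normalizers.Sequence(
    [
        normalizers.Prepend(prepend="▁"),
        normalizers.Replace(pattern=" ", content="▁"),
    ]
)
# Prepend adds a leading "▁"; Replace turns every remaining space into "▁".
print(llama_like.normalize_str("Hello world"))  # expected: "▁Hello▁world"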

View File

@@ -79,7 +79,7 @@ deps = {
     "tf2onnx": "tf2onnx",
     "timeout-decorator": "timeout-decorator",
     "timm": "timm",
-    "tokenizers": "tokenizers>=0.14,<0.19",
+    "tokenizers": "tokenizers>=0.19,<0.20",
     "torch": "torch",
     "torchaudio": "torchaudio",
     "torchvision": "torchvision",