Compare commits

...

3 Commits

Author SHA1 Message Date
Nicolas Patry 9a72749b0f Ruff. 2024-04-17 10:21:11 +02:00
Nicolas Patry 17daeefa7c Accounting for the breaking change. 2024-04-17 10:21:11 +02:00
Nicolas Patry 5f770d4cab [DO NOT MERGE] Testing tokenizers 0.19.0rc0 2024-04-17 10:21:11 +02:00
4 changed files with 26 additions and 13 deletions

View File

@@ -46,12 +46,16 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
         )
         tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
             [
-                pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space),
+                pre_tokenizers.Metaspace(
+                    replacement=replacement, add_prefix_space="always" if add_prefix_space else "never"
+                ),
                 pre_tokenizers.Digits(individual_digits=True),
                 pre_tokenizers.Punctuation(),
             ]
         )
-        tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
+        tokenizer.decoder = decoders.Metaspace(
+            replacement=replacement, add_prefix_space="always" if add_prefix_space else "never"
+        )
         tokenizer.post_processor = TemplateProcessing(
             single=f"$A {self.special_tokens['eos']['token']}",

View File

@@ -174,7 +174,7 @@ _deps = [
     "tf2onnx",
     "timeout-decorator",
     "timm",
-    "tokenizers>=0.14,<0.19",
+    "tokenizers==0.19.0rc0",
     "torch",
     "torchaudio",
     "torchvision",

View File

@@ -43,6 +43,16 @@ def import_protobuf(error_message=""):
         raise ImportError(PROTOBUF_IMPORT_ERROR.format(error_message))
 
 
+def _get_prepend_scheme(add_prefix_space: bool, original_tokenizer) -> str:
+    if add_prefix_space:
+        prepend_scheme = "always"
+        if hasattr(original_tokenizer, "legacy") and not original_tokenizer.legacy:
+            prepend_scheme = "first"
+    else:
+        prepend_scheme = "never"
+    return prepend_scheme
+
+
 class SentencePieceExtractor:
     """
     Extractor implementation for SentencePiece trained models. https://github.com/google/sentencepiece
@@ -597,18 +607,15 @@ class SpmConverter(Converter):
         return normalizers.Sequence([normalizers.Precompiled(precompiled_charsmap)] + _normalizers)
 
     def pre_tokenizer(self, replacement, add_prefix_space):
-        prepend_scheme = "always"
-        if hasattr(self.original_tokenizer, "legacy") and not self.original_tokenizer.legacy:
-            prepend_scheme = "first"
-        return pre_tokenizers.Metaspace(
-            replacement=replacement, add_prefix_space=add_prefix_space, prepend_scheme=prepend_scheme
-        )
+        prepend_scheme = _get_prepend_scheme(add_prefix_space, self.original_tokenizer)
+        return pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
 
     def post_processor(self):
         return None
 
     def decoder(self, replacement, add_prefix_space):
-        return decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
+        prepend_scheme = _get_prepend_scheme(add_prefix_space, self.original_tokenizer)
+        return decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
 
     def converted(self) -> Tokenizer:
         tokenizer = self.tokenizer(self.proto)
@@ -722,7 +729,8 @@ class DebertaV2Converter(SpmConverter):
         list_pretokenizers = []
         if self.original_tokenizer.split_by_punct:
             list_pretokenizers.append(pre_tokenizers.Punctuation(behavior="isolated"))
-        list_pretokenizers.append(pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space))
+        prepend_scheme = _get_prepend_scheme(add_prefix_space, self.original_tokenizer)
+        list_pretokenizers.append(pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme))
         return pre_tokenizers.Sequence(list_pretokenizers)
 
     def normalizer(self, proto):
@@ -1007,10 +1015,11 @@ class PegasusConverter(SpmConverter):
         return proto.trainer_spec.unk_id + self.original_tokenizer.offset
 
     def pre_tokenizer(self, replacement, add_prefix_space):
+        prepend_scheme = _get_prepend_scheme(add_prefix_space, self.original_tokenizer)
         return pre_tokenizers.Sequence(
             [
                 pre_tokenizers.WhitespaceSplit(),
-                pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space),
+                pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme),
             ]
         )
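
To make the new helper's behaviour concrete, here is a small self-contained check (editor's sketch: the function body is copied from the hunk above, while the SimpleNamespace objects are hypothetical stand-ins for slow tokenizers and their legacy attribute):

# Mapping implemented by _get_prepend_scheme, exercised on dummy tokenizers.
from types import SimpleNamespace


def _get_prepend_scheme(add_prefix_space: bool, original_tokenizer) -> str:
    if add_prefix_space:
        prepend_scheme = "always"
        if hasattr(original_tokenizer, "legacy") and not original_tokenizer.legacy:
            prepend_scheme = "first"
    else:
        prepend_scheme = "never"
    return prepend_scheme


legacy_tok = SimpleNamespace(legacy=True)
non_legacy_tok = SimpleNamespace(legacy=False)

assert _get_prepend_scheme(True, legacy_tok) == "always"
assert _get_prepend_scheme(True, non_legacy_tok) == "first"  # non-legacy tokenizers
assert _get_prepend_scheme(False, non_legacy_tok) == "never"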

View File

@@ -79,7 +79,7 @@ deps = {
     "tf2onnx": "tf2onnx",
     "timeout-decorator": "timeout-decorator",
     "timm": "timm",
-    "tokenizers": "tokenizers>=0.14,<0.19",
+    "tokenizers": "tokenizers==0.19.0rc0",
     "torch": "torch",
     "torchaudio": "torchaudio",
     "torchvision": "torchvision",