Compare commits
3 Commits
main
...
test_token
Author | SHA1 | Date |
---|---|---|
Nicolas Patry | 9a72749b0f | |
Nicolas Patry | 17daeefa7c | |
Nicolas Patry | 5f770d4cab |
|
@ -46,12 +46,16 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
|
||||||
)
|
)
|
||||||
tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
|
tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
|
||||||
[
|
[
|
||||||
pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space),
|
pre_tokenizers.Metaspace(
|
||||||
|
replacement=replacement, add_prefix_space="always" if add_prefix_space else "never"
|
||||||
|
),
|
||||||
pre_tokenizers.Digits(individual_digits=True),
|
pre_tokenizers.Digits(individual_digits=True),
|
||||||
pre_tokenizers.Punctuation(),
|
pre_tokenizers.Punctuation(),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
|
tokenizer.decoder = decoders.Metaspace(
|
||||||
|
replacement=replacement, add_prefix_space="always" if add_prefix_space else "never"
|
||||||
|
)
|
||||||
|
|
||||||
tokenizer.post_processor = TemplateProcessing(
|
tokenizer.post_processor = TemplateProcessing(
|
||||||
single=f"$A {self.special_tokens['eos']['token']}",
|
single=f"$A {self.special_tokens['eos']['token']}",
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -174,7 +174,7 @@ _deps = [
|
||||||
"tf2onnx",
|
"tf2onnx",
|
||||||
"timeout-decorator",
|
"timeout-decorator",
|
||||||
"timm",
|
"timm",
|
||||||
"tokenizers>=0.14,<0.19",
|
"tokenizers==0.19.0rc0",
|
||||||
"torch",
|
"torch",
|
||||||
"torchaudio",
|
"torchaudio",
|
||||||
"torchvision",
|
"torchvision",
|
||||||
|
|
|
@ -43,6 +43,16 @@ def import_protobuf(error_message=""):
|
||||||
raise ImportError(PROTOBUF_IMPORT_ERROR.format(error_message))
|
raise ImportError(PROTOBUF_IMPORT_ERROR.format(error_message))
|
||||||
|
|
||||||
|
|
||||||
|
def _get_prepend_scheme(add_prefix_space: bool, original_tokenizer) -> str:
|
||||||
|
if add_prefix_space:
|
||||||
|
prepend_scheme = "always"
|
||||||
|
if hasattr(original_tokenizer, "legacy") and not original_tokenizer.legacy:
|
||||||
|
prepend_scheme = "first"
|
||||||
|
else:
|
||||||
|
prepend_scheme = "never"
|
||||||
|
return prepend_scheme
|
||||||
|
|
||||||
|
|
||||||
class SentencePieceExtractor:
|
class SentencePieceExtractor:
|
||||||
"""
|
"""
|
||||||
Extractor implementation for SentencePiece trained models. https://github.com/google/sentencepiece
|
Extractor implementation for SentencePiece trained models. https://github.com/google/sentencepiece
|
||||||
|
@ -597,18 +607,15 @@ class SpmConverter(Converter):
|
||||||
return normalizers.Sequence([normalizers.Precompiled(precompiled_charsmap)] + _normalizers)
|
return normalizers.Sequence([normalizers.Precompiled(precompiled_charsmap)] + _normalizers)
|
||||||
|
|
||||||
def pre_tokenizer(self, replacement, add_prefix_space):
|
def pre_tokenizer(self, replacement, add_prefix_space):
|
||||||
prepend_scheme = "always"
|
prepend_scheme = _get_prepend_scheme(add_prefix_space, self.original_tokenizer)
|
||||||
if hasattr(self.original_tokenizer, "legacy") and not self.original_tokenizer.legacy:
|
return pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
|
||||||
prepend_scheme = "first"
|
|
||||||
return pre_tokenizers.Metaspace(
|
|
||||||
replacement=replacement, add_prefix_space=add_prefix_space, prepend_scheme=prepend_scheme
|
|
||||||
)
|
|
||||||
|
|
||||||
def post_processor(self):
|
def post_processor(self):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def decoder(self, replacement, add_prefix_space):
|
def decoder(self, replacement, add_prefix_space):
|
||||||
return decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
|
prepend_scheme = _get_prepend_scheme(add_prefix_space, self.original_tokenizer)
|
||||||
|
return decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
|
||||||
|
|
||||||
def converted(self) -> Tokenizer:
|
def converted(self) -> Tokenizer:
|
||||||
tokenizer = self.tokenizer(self.proto)
|
tokenizer = self.tokenizer(self.proto)
|
||||||
|
@ -722,7 +729,8 @@ class DebertaV2Converter(SpmConverter):
|
||||||
list_pretokenizers = []
|
list_pretokenizers = []
|
||||||
if self.original_tokenizer.split_by_punct:
|
if self.original_tokenizer.split_by_punct:
|
||||||
list_pretokenizers.append(pre_tokenizers.Punctuation(behavior="isolated"))
|
list_pretokenizers.append(pre_tokenizers.Punctuation(behavior="isolated"))
|
||||||
list_pretokenizers.append(pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space))
|
prepend_scheme = _get_prepend_scheme(add_prefix_space, self.original_tokenizer)
|
||||||
|
list_pretokenizers.append(pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme))
|
||||||
return pre_tokenizers.Sequence(list_pretokenizers)
|
return pre_tokenizers.Sequence(list_pretokenizers)
|
||||||
|
|
||||||
def normalizer(self, proto):
|
def normalizer(self, proto):
|
||||||
|
@ -1007,10 +1015,11 @@ class PegasusConverter(SpmConverter):
|
||||||
return proto.trainer_spec.unk_id + self.original_tokenizer.offset
|
return proto.trainer_spec.unk_id + self.original_tokenizer.offset
|
||||||
|
|
||||||
def pre_tokenizer(self, replacement, add_prefix_space):
|
def pre_tokenizer(self, replacement, add_prefix_space):
|
||||||
|
prepend_scheme = _get_prepend_scheme(add_prefix_space, self.original_tokenizer)
|
||||||
return pre_tokenizers.Sequence(
|
return pre_tokenizers.Sequence(
|
||||||
[
|
[
|
||||||
pre_tokenizers.WhitespaceSplit(),
|
pre_tokenizers.WhitespaceSplit(),
|
||||||
pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space),
|
pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@ -79,7 +79,7 @@ deps = {
|
||||||
"tf2onnx": "tf2onnx",
|
"tf2onnx": "tf2onnx",
|
||||||
"timeout-decorator": "timeout-decorator",
|
"timeout-decorator": "timeout-decorator",
|
||||||
"timm": "timm",
|
"timm": "timm",
|
||||||
"tokenizers": "tokenizers>=0.14,<0.19",
|
"tokenizers": "tokenizers==0.19.0rc0",
|
||||||
"torch": "torch",
|
"torch": "torch",
|
||||||
"torchaudio": "torchaudio",
|
"torchaudio": "torchaudio",
|
||||||
"torchvision": "torchvision",
|
"torchvision": "torchvision",
|
||||||
|
|
Loading…
Reference in New Issue