commit d5cb129e3d (parent d5a20b8e94)
Author: Arthur Zucker
Date:   2024-04-17 10:55:42 +02:00

2 changed files with 9 additions and 5 deletions


@@ -46,12 +46,16 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
        )
        tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
            [
-               pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space="always" if add_prefix_space else "never"),
+               pre_tokenizers.Metaspace(
+                   replacement=replacement, add_prefix_space="always" if add_prefix_space else "never"
+               ),
                pre_tokenizers.Digits(individual_digits=True),
                pre_tokenizers.Punctuation(),
            ]
        )
-       tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space="always" if add_prefix_space else "never")
+       tokenizer.decoder = decoders.Metaspace(
+           replacement=replacement, add_prefix_space="always" if add_prefix_space else "never"
+       )
        tokenizer.post_processor = TemplateProcessing(
            single=f"$A {self.special_tokens['eos']['token']}",
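
Not part of the diff: the two reformatted calls above only change line wrapping; the "always"/"never" strings encode whether a leading space (rendered as the replacement character "▁") is prepended before pre-tokenization. A minimal sketch of that mapping, assuming a tokenizers release that exposes the string-valued prepend_scheme argument on Metaspace (older releases took a boolean add_prefix_space instead):

from tokenizers import pre_tokenizers

add_prefix_space = True
# Map the boolean flag onto the scheme string used in the diff above.
pre_tok = pre_tokenizers.Metaspace(
    replacement="▁", prepend_scheme="always" if add_prefix_space else "never"
)

# With "always", the first word gets the same "▁" prefix as every other word.
print(pre_tok.pre_tokenize_str("Hello world"))
# e.g. [('▁Hello', (0, 5)), ('▁world', (5, 11))]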


@@ -23,6 +23,7 @@ import warnings
from typing import Dict, List, Tuple

from packaging import version
from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
from tokenizers.models import BPE, Unigram, WordPiece
@@ -42,6 +43,7 @@ def import_protobuf(error_message=""):
    else:
        raise ImportError(PROTOBUF_IMPORT_ERROR.format(error_message))

def _get_prepend_scheme(add_prefix_space: bool, original_tokenizer) -> str:
    if add_prefix_space:
        prepend_scheme = "always"
@@ -607,9 +609,7 @@ class SpmConverter(Converter):
    def pre_tokenizer(self, replacement, add_prefix_space):
        prepend_scheme = _get_prepend_scheme(add_prefix_space, self.original_tokenizer)
-       return pre_tokenizers.Metaspace(
-           replacement=replacement, prepend_scheme=prepend_scheme
-       )
+       return pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)

    def post_processor(self):
        return None
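
For context, not shown in full by the hunks above: _get_prepend_scheme converts the slow tokenizer's boolean add_prefix_space into the string scheme that the fast Metaspace pre-tokenizer and decoder expect, and SpmConverter.pre_tokenizer forwards the result. A rough sketch of that pattern follows; the helper is truncated in the hunk, so the "first"/legacy branch below is an illustration of the general pattern rather than a verbatim copy of the file, and it again assumes a tokenizers release with prepend_scheme.

from tokenizers import pre_tokenizers


def get_prepend_scheme(add_prefix_space: bool, original_tokenizer) -> str:
    # Sketch of the helper: "never" when no prefix space is wanted,
    # "first" for non-legacy sentencepiece tokenizers (prefix only the
    # very first piece), "always" otherwise.
    if not add_prefix_space:
        return "never"
    if not getattr(original_tokenizer, "legacy", True):
        return "first"
    return "always"


def build_pre_tokenizer(replacement: str, add_prefix_space: bool, original_tokenizer):
    # Mirrors the single-line return in the diff above.
    scheme = get_prepend_scheme(add_prefix_space, original_tokenizer)
    return pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=scheme)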