Compare commits

Comparing `main`...`30824-spmc` (12 commits)

| Author | SHA1 | Date |
|---|---|---|
| Ita Zaporozhets | 74e78f1720 | |
| Ita Zaporozhets | d92822e045 | |
| Ita Zaporozhets | c416522a76 | |
| Ita Zaporozhets | 84143a2cc3 | |
| Ita Zaporozhets | ff5974bb61 | |
| Ita Zaporozhets | fdb63e21db | |
| Ita Zaporozhets | 79ce5bb67f | |
| Ita Zaporozhets | 896b7d152e | |
| Ita Zaporozhets | 7afb15921d | |
| Ita Zaporozhets | 31fbe4f12c | |
| Ita Zaporozhets | d1ea757c21 | |
| Ita Zaporozhets | 24ea0cd756 | |
src/transformers/convert_slow_tokenizer.py:

```diff
@@ -620,14 +620,26 @@ class SpmConverter(Converter):
     def converted(self) -> Tokenizer:
         tokenizer = self.tokenizer(self.proto)
 
+        # Add user defined symbols
+        user_defined_symbols = [
+            AddedToken(token, normalized=True, special=False) for token in self.proto.trainer_spec.user_defined_symbols
+        ]
+        control_symbols = [
+            AddedToken(token, normalized=True, special=False) for token in self.proto.trainer_spec.control_symbols
+        ]
+        tokenizer.add_tokens(user_defined_symbols + control_symbols)
+
         # Tokenizer assemble
         normalizer = self.normalizer(self.proto)
         if normalizer is not None:
             tokenizer.normalizer = normalizer
 
         replacement = "▁"
-        add_prefix_space = True
-        if hasattr(self.original_tokenizer, "add_prefix_space"):
+        # TODO:ita added 1
+        add_prefix_space = self.proto.normalizer_spec.add_dummy_prefix
+        # tokenizer.add_prefix_space = add_prefix_space
+
+        if hasattr(self.original_tokenizer, "add_prefix_space") and self.original_tokenizer.add_prefix_space is not None:
             add_prefix_space = self.original_tokenizer.add_prefix_space
 
         pre_tokenizer = self.pre_tokenizer(replacement, add_prefix_space)
```
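For orientation, here is a minimal sketch (not part of the diff) of the resolution order the converter now applies; `proto` and `slow_tok` are stand-ins for `self.proto` and `self.original_tokenizer`:

```python
def resolve_add_prefix_space(proto, slow_tok):
    # The default comes from the sentencepiece model itself (normalizer_spec.add_dummy_prefix).
    add_prefix_space = proto.normalizer_spec.add_dummy_prefix
    # An explicit setting on the slow tokenizer overrides the proto default.
    if getattr(slow_tok, "add_prefix_space", None) is not None:
        add_prefix_space = slow_tok.add_prefix_space
    return add_prefix_space
```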
src/transformers/models/llama/tokenization_llama.py:

```diff
@@ -167,8 +167,8 @@ class LlamaTokenizer(PreTrainedTokenizer):
         self.add_bos_token = add_bos_token
         self.add_eos_token = add_eos_token
         self.use_default_system_prompt = use_default_system_prompt
-        self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))
         self.add_prefix_space = add_prefix_space
+        self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))
 
         super().__init__(
             bos_token=bos_token,
```
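The reordering above matters: `get_spm_processor`, patched in the next hunk, now reads `self.add_prefix_space`, so the attribute must be set before the processor is built.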
src/transformers/models/llama/tokenization_llama.py:

```diff
@@ -203,6 +203,7 @@ class LlamaTokenizer(PreTrainedTokenizer):
         model = model_pb2.ModelProto.FromString(sp_model)
         normalizer_spec = model_pb2.NormalizerSpec()
         normalizer_spec.add_dummy_prefix = False
+        self.add_prefix_space = normalizer_spec.add_dummy_prefix if self.add_prefix_space is None else self.add_prefix_space
         model.normalizer_spec.MergeFrom(normalizer_spec)
         sp_model = model.SerializeToString()
         tokenizer.LoadFromSerializedProto(sp_model)
```
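A hedged illustration of what the `add_dummy_prefix = False` override does at the SentencePiece level, assuming the `sentencepiece` and `protobuf` packages are installed and a hypothetical local `tokenizer.model` file:

```python
import sentencepiece as spm
from sentencepiece import sentencepiece_model_pb2 as model_pb2

with open("tokenizer.model", "rb") as f:  # hypothetical path
    model = model_pb2.ModelProto.FromString(f.read())
model.normalizer_spec.add_dummy_prefix = False  # the same override as in the hunk above

sp = spm.SentencePieceProcessor()
sp.LoadFromSerializedProto(model.SerializeToString())
# Without the dummy prefix, "Hey" should not gain a leading "▁" piece.
print(sp.encode("Hey", out_type=str))
```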
src/transformers/models/llama/tokenization_llama_fast.py:

```diff
@@ -13,10 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
+import json
+
 from shutil import copyfile
 from typing import Optional, Tuple
 
-from tokenizers import processors
+from tokenizers import pre_tokenizers, normalizers, processors
 
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
 from ...utils import is_sentencepiece_available, logging
```
src/transformers/models/llama/tokenization_llama_fast.py:

```diff
@@ -150,7 +152,12 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
             legacy = True
         self.legacy = legacy
 
-        if add_prefix_space is not None:
+        # TODO: ita
+        self.add_prefix_space = add_prefix_space
+        # if add_prefix_space is not None:
+        #     kwargs["from_slow"] = True
+
+        if self.force_from_slow() is True:
             kwargs["from_slow"] = True
 
         super().__init__(
```
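A hedged usage sketch of the user-facing behavior this targets (the checkpoint name is an assumption; any SentencePiece-based Llama checkpoint should behave the same):

```python
from transformers import AutoTokenizer

# With add_prefix_space=False the first word should not receive the automatic "▁".
tok = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", add_prefix_space=False)
print(tok.tokenize("Hey friend"))
```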
src/transformers/models/llama/tokenization_llama_fast.py (indentation reconstructed; the `return` in `update_normalizer` is inferred from the identical method added to `tokenization_utils_fast.py` below):

```diff
@@ -203,6 +210,48 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
             single=single, pair=pair, special_tokens=special_tokens
         )
 
+    def force_from_slow(self):
+        if getattr(self, "add_prefix_space") is None:
+            if getattr(self, "_tokenizer", None) is None:
+                return True
+            curr_normalizer = json.loads(self._tokenizer.normalizer.__getstate__().decode("utf-8"))
+            prepend_normalizer = [n for n in curr_normalizer["normalizers"] if n["type"] == "Prepend"]
+            if not prepend_normalizer:
+                return True
+
+    def update_normalizer(self):
+        """Updates the underlying normalizer with the current `add_prefix_space` and `legacy` settings."""
+        sequence = []
+        if getattr(self, "legacy", True):
+            if getattr(self, "add_prefix_space", True):
+                sequence += [normalizers.Prepend(prepend="▁")]
+            sequence += [normalizers.Replace(pattern=" ", content="▁")]
+        elif not getattr(self, "legacy", True):
+            return
+        self._tokenizer.normalizer = normalizers.Sequence(sequence)
+
+    def update_pre_tokenizer(self):
+        """Updates the underlying pre-tokenizer with the current `add_prefix_space` setting."""
+        if getattr(self, "add_prefix_space") is False:
+            prepend_scheme = "never"
+        elif getattr(self, "add_prefix_space") is None:
+            curr_normalizer = json.loads(self._tokenizer.normalizer.__getstate__().decode("utf-8"))
+            prepend_normalizer = [n for n in curr_normalizer["normalizers"] if n["type"] == "Prepend"]
+            if prepend_normalizer:
+                prepend_normalizer = prepend_normalizer[0]
+                replacement = prepend_normalizer["prepend"]
+                self.add_prefix_space = True
+            else:
+                return
+        if getattr(self, "add_prefix_space", True):
+            prepend_scheme = "always"
+            if not getattr(self, "legacy", True):
+                prepend_scheme = "first"
+        self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
+            replacement="▁", prepend_scheme=prepend_scheme, split=False
+        )
+        self.update_normalizer()
+
     @property
     def add_eos_token(self):
         return self._add_eos_token
```
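`force_from_slow` and `update_pre_tokenizer` both probe the backend through the serialized normalizer state. A small sketch of that probe in isolation (the checkpoint name and the reliance on the private `_tokenizer` attribute are assumptions for illustration):

```python
import json

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
state = json.loads(tok._tokenizer.normalizer.__getstate__().decode("utf-8"))
# A legacy Llama tokenizer is expected to show a Prepend step here.
print([n["type"] for n in state.get("normalizers", [])])
```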
src/transformers/models/t5/tokenization_t5.py:

```diff
@@ -134,7 +134,7 @@ class T5Tokenizer(PreTrainedTokenizer):
         additional_special_tokens=None,
         sp_model_kwargs: Optional[Dict[str, Any]] = None,
         legacy=None,
-        add_prefix_space=True,
+        add_prefix_space=None,
         **kwargs,
     ) -> None:
         pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token
```
src/transformers/models/t5/tokenization_t5.py (the same attribute-before-processor reordering as in the Llama slow tokenizer):

```diff
@@ -181,10 +181,10 @@ class T5Tokenizer(PreTrainedTokenizer):
             legacy = True
 
         self.legacy = legacy
+        self.add_prefix_space = add_prefix_space
         self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))
         self.vocab_file = vocab_file
         self._extra_ids = extra_ids
-        self.add_prefix_space = add_prefix_space
 
         super().__init__(
             eos_token=eos_token,
```
src/transformers/models/t5/tokenization_t5.py:

```diff
@@ -211,6 +211,7 @@ class T5Tokenizer(PreTrainedTokenizer):
         model = model_pb2.ModelProto.FromString(sp_model)
         normalizer_spec = model_pb2.NormalizerSpec()
         normalizer_spec.add_dummy_prefix = False
+        self.add_prefix_space = normalizer_spec.add_dummy_prefix if self.add_prefix_space is None else self.add_prefix_space
         model.normalizer_spec.MergeFrom(normalizer_spec)
         sp_model = model.SerializeToString()
         tokenizer.LoadFromSerializedProto(sp_model)
```
src/transformers/models/t5/tokenization_t5_fast.py:

```diff
@@ -114,7 +114,9 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
             logger.warning_once(
                 "You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers"
             )
-            kwargs["from_slow"] = True
+            # kwargs["from_slow"] = True
 
+        self.add_prefix_space = add_prefix_space
+
         super().__init__(
             vocab_file,
```
src/transformers/models/t5/tokenization_t5_fast.py:

```diff
@@ -123,6 +125,7 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
             unk_token=unk_token,
             pad_token=pad_token,
             extra_ids=extra_ids,
+            add_prefix_space=add_prefix_space,
             additional_special_tokens=additional_special_tokens,
             **kwargs,
         )
```
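Correspondingly for T5, a hedged usage sketch (the checkpoint name is an assumption):

```python
from transformers import T5TokenizerFast

tok = T5TokenizerFast.from_pretrained("t5-small", add_prefix_space=False)
print(tok.tokenize("Hey"))  # expected to start without the "▁" prefix
```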
src/transformers/tokenization_utils.py:

```diff
@@ -607,6 +607,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
                     f" to the tokenizer. This means that it is not an `AddedToken` but a {type(tok_extended)}"
                 )
             # ["This is something", "<special_token_1>", "else"]
+        import json
         tokenized_text = []
         for token in tokens:
             # Need to skip eventual empty (fully stripped) tokens
```
src/transformers/tokenization_utils_fast.py:

```diff
@@ -23,6 +23,7 @@ import os
 from collections import defaultdict
 from typing import Any, Dict, List, Optional, Tuple, Union
 
+from tokenizers import pre_tokenizers, normalizers, processors
 import tokenizers.pre_tokenizers as pre_tokenizers_fast
 from tokenizers import Encoding as EncodingFast
 from tokenizers import Tokenizer as TokenizerFast
```
src/transformers/tokenization_utils_fast.py:

```diff
@@ -102,15 +103,16 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
         from_slow = kwargs.pop("from_slow", False)
         added_tokens_decoder = kwargs.pop("added_tokens_decoder", {})
 
-        if from_slow and slow_tokenizer is None and self.slow_tokenizer_class is None:
-            raise ValueError(
-                "Cannot instantiate this tokenizer from a slow version. If it's based on sentencepiece, make sure you "
-                "have sentencepiece installed."
-            )
+        # TODO:Ita
+        # if from_slow and slow_tokenizer is None and self.slow_tokenizer_class is None:
+        #     raise ValueError(
+        #         "Cannot instantiate this tokenizer from a slow version. If it's based on sentencepiece, make sure you "
+        #         "have sentencepiece installed."
+        #     )
 
         if tokenizer_object is not None:
             fast_tokenizer = copy.deepcopy(tokenizer_object)
-        elif fast_tokenizer_file is not None and not from_slow:
+        elif fast_tokenizer_file is not None:  # and not from_slow:
             # We have a serialization from tokenizers which let us directly build the backend
             fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
         elif slow_tokenizer is not None:
```
src/transformers/tokenization_utils_fast.py (the added branch repeats the `elif slow_tokenizer is not None:` condition above it and references GGUF helpers; kept as it appears in the diff):

```diff
@@ -124,6 +126,11 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
             # We need to create and convert a slow tokenizer to build the backend
             slow_tokenizer = self.slow_tokenizer_class(*args, **kwargs)
             fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)
+        elif slow_tokenizer is not None:
+            # We need to convert a slow tokenizer to build the backend
+            tokenizer_dict = load_gguf_checkpoint(kwargs.get("vocab_file"))["tokenizer"]
+            fast_tokenizer = convert_gguf_tokenizer(tokenizer_dict)
+
         else:
             raise ValueError(
                 "Couldn't instantiate the backend tokenizer from one of: \n"
```
src/transformers/tokenization_utils_fast.py:

```diff
@@ -135,6 +142,8 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
 
         self._tokenizer = fast_tokenizer
 
+        self.update_pre_tokenizer()
+
         if slow_tokenizer is not None:
             kwargs.update(slow_tokenizer.init_kwargs)
 
```
src/transformers/tokenization_utils_fast.py (indentation reconstructed; note `update_normalizer` assigns the normalizer twice on the legacy path, flagged by the author's own `TODO:ita2`):

```diff
@@ -861,3 +870,46 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
             kwargs["additional_special_tokens"] = additional_special_tokens
 
         return self.__class__(tokenizer_object=tokenizer, **kwargs)
+
+    def update_normalizer(self):
+        """Updates the underlying normalizer with the current `add_prefix_space` and `legacy` settings."""
+        sequence = []
+        if getattr(self, "legacy", True):
+            if getattr(self, "add_prefix_space", True):
+                sequence += [normalizers.Prepend(prepend="▁")]
+            sequence += [normalizers.Replace(pattern=" ", content="▁")]
+            self._tokenizer.normalizer = normalizers.Sequence(sequence)
+        elif not getattr(self, "legacy", True):
+            return
+        self._tokenizer.normalizer = normalizers.Sequence(sequence)  # TODO:ita2
+
+    def update_pre_tokenizer(self):
+        """Updates the underlying pre-tokenizer with the current `add_prefix_space` setting."""
+        if getattr(self, "add_prefix_space", None) is None:
+            if getattr(self._tokenizer, "normalizer", None) is None:
+                return
+            curr_normalizer = json.loads(self._tokenizer.normalizer.__getstate__().decode("utf-8"))
+            if "normalizers" not in curr_normalizer:
+                return
+            prepend_normalizer = [n for n in curr_normalizer["normalizers"] if n["type"] == "Prepend"]
+            if prepend_normalizer:
+                prepend_normalizer = prepend_normalizer[0]
+                replacement = prepend_normalizer["prepend"]
+                self.add_prefix_space = True
+            else:
+                return
+        elif getattr(self, "add_prefix_space") is False:
+            prepend_scheme = "never"
+
+        if getattr(self, "add_prefix_space", True):
+            prepend_scheme = "always"
+            if not getattr(self, "legacy", True):
+                prepend_scheme = "first"
+        self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
+            replacement="▁", prepend_scheme=prepend_scheme, split=False
+        )
+        self.update_normalizer()
```
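To see the three `prepend_scheme` values in isolation, a minimal sketch using only the `tokenizers` library (assumes a release recent enough for `Metaspace` to accept `prepend_scheme` and `split`, as the hunk itself does):

```python
from tokenizers import pre_tokenizers

for scheme in ("always", "never", "first"):
    pt = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme=scheme, split=False)
    # "never" skips the leading "▁"; "always" and "first" differ on non-initial splits.
    print(scheme, pt.pre_tokenize_str("Hey friend"))
```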